File size: 6,471 Bytes
25e3c44
 
 
c679076
25e3c44
 
 
 
 
 
b86781d
 
 
 
 
25e3c44
 
b86781d
 
 
 
 
25e3c44
 
 
 
c679076
a3ab355
25e3c44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a3ab355
 
25e3c44
 
 
 
 
b86781d
25e3c44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77c9a6d
 
25e3c44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b86781d
 
 
 
 
 
 
77517d8
 
 
b86781d
 
a3ab355
 
77517d8
 
 
a3ab355
 
b86781d
 
 
77517d8
b86781d
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import pandas as pd
import numpy as np
import xgboost as xgb
import streamlit as st
import requests
from bs4 import BeautifulSoup
from gensim.models import FastText
import joblib

# Load the trained FastText model
# NOTE(review): gensim's FastText.load() expects a model saved with gensim's
# own .save(); it is NOT the loader for Facebook's native .bin format (that
# would be load_facebook_model). Confirm how 'fasttext_model.bin' was produced.
try:
    fasttext_model = FastText.load('fasttext_model.bin')
except FileNotFoundError:
    st.error("The FastText model file was not found. Please ensure 'fasttext_model.bin' and its associated files are in the correct directory.")
    st.stop()  # abort this Streamlit run so later code never sees an undefined model

# Load the trained XGBoost model for the combined features
try:
    model = joblib.load('model.pkl')
except FileNotFoundError:
    st.error("The XGBoost model file was not found. Please ensure 'model.pkl' is in the correct directory.")
    st.stop()  # same pattern: halt rather than crash with NameError below

def tokenize(text):
    """Split *text* on whitespace; any non-string input yields an empty list."""
    return text.split() if isinstance(text, str) else []

def embed_text(text_series, fasttext_model):
    """Embed each text in *text_series* as the mean of its in-vocabulary
    FastText word vectors.

    Texts that are not strings, are empty, or contain only out-of-vocabulary
    tokens map to a zero vector of length ``fasttext_model.vector_size``.
    Returns a 2-D numpy array with one row per input text.
    """
    dim = fasttext_model.vector_size
    rows = []
    for raw in text_series:
        # Whitespace tokenization; non-strings contribute no tokens.
        tokens = raw.split() if isinstance(raw, str) else []
        vecs = [fasttext_model.wv[t] for t in tokens if t in fasttext_model.wv]
        rows.append(np.mean(vecs, axis=0) if vecs else np.zeros(dim))
    return np.array(rows)

def preprocess_input(query, title, description, url, fasttext_model):
    """Embed the four text fields and pack them into a one-row xgb.DMatrix.

    NaN/None fields are treated as empty strings; everything else is
    stringified before embedding. The row is the horizontal concatenation
    of the query, title, description and URL embeddings, in that order.
    """
    fields = (query, title, description, url)
    cleaned = [str(f) if pd.notna(f) else '' for f in fields]

    # One single-element Series per field, embedded independently.
    parts = [embed_text(pd.Series([text]), fasttext_model) for text in cleaned]

    return xgb.DMatrix(np.hstack(parts))

def extract_title_description(url):
    """Fetch *url* and scrape its <title> text and meta description.

    Returns a (title, description) pair of strings. On any network or
    parsing failure the pair contains placeholder error strings instead of
    raising, so callers can always display something.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36'
    }
    try:
        # timeout prevents this Streamlit run from hanging forever on a
        # dead or slow host (requests has no default timeout).
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        # soup.title.string is None when the <title> tag contains nested
        # markup; fall back to the placeholder in that case too.
        title = (soup.title.string or 'No title found') if soup.title else 'No title found'
        description_tag = soup.find('meta', attrs={'name': 'description'})
        # .get() also covers a <meta name="description"> with no content attr,
        # which previously raised KeyError and lost the scraped title.
        description = description_tag.get('content', 'No description found') if description_tag else 'No description found'
        return title, description
    except Exception:
        # Deliberate best-effort scrape: swallow errors, return sentinels.
        return 'Error extracting title', 'Error extracting description'

def predict(query, title, description, url, fasttext_model):
    """Score one (query, title, description, url) example with the global model.

    Returns (binary_prediction, probability): the raw model score and its
    0/1 thresholding at 0.5.
    """
    features = preprocess_input(query, title, description, url, fasttext_model)
    probability = model.predict(features, validate_features=False)[0]
    label = 1 if probability >= 0.5 else 0
    return label, probability

# Streamlit interface
st.title('CTR Prediction Inference')

tab1, tab2, tab3 = st.tabs(["Single Entry", "Batch Entry", "A/B Test"])

with tab1:
    st.header('Single Entry Inference')
    
    query = st.text_input('Query')
    url = st.text_input('URL')

    if st.button('Predict'):
        # Scrape first so the user always sees what text the model would get.
        # NOTE(review): scraping runs even when query/url are blank — the
        # emptiness check below only guards the prediction itself.
        title, description = extract_title_description(url)
        st.write(f'Extracted Title: {title}')
        st.write(f'Extracted Description: {description}')
        
        if query and url:
            binary_result, confidence = predict(query, title, description, url, fasttext_model)
            st.write(f'Predicted +/-: {binary_result}')
            st.write(f'Conf.: {confidence:.2%}')
            # st.progress takes an int in [0, 100]; truncate the probability.
            confidence_percentage = int(confidence * 100)
            st.progress(confidence_percentage)
        else:
            st.write('Please enter both a query and a URL.')

with tab2:
    st.header('Batch Entry Inference')
    
    uploaded_file = st.file_uploader("Upload CSV", type="csv")
    
    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file)
        # The CSV must supply all four model inputs by these exact column names.
        required_columns = ['Query', 'Title', 'Description', 'URL']
        
        if set(required_columns).issubset(df.columns):
            # One model call per row; for large CSVs this is the slow path.
            predictions = []
            confidences = []
            for index, row in df.iterrows():
                binary_result, confidence = predict(row['Query'], row['Title'], row['Description'], row['URL'], fasttext_model)
                predictions.append(binary_result)
                confidences.append(confidence)
            
            df['+/-'] = predictions
            df['Conf.'] = [f"{conf:.2%}" for conf in confidences]
            
            # Move the two result columns to the front for display/download.
            cols = ['+/-', 'Conf.'] + [col for col in df.columns if col not in ['+/-', 'Conf.']]
            df = df[cols]
            
            st.write(df)
            st.download_button("Download Predictions", df.to_csv(index=False), "predictions.csv")
        else:
            st.write('CSV must contain Query, Title, Description, and URL columns.')

with tab3:
    st.header('A/B Test Inference')

    query = st.text_input('Query for A/B Test')
    url = st.text_input('URL for A/B Test')

    # Two-phase flow tracked across Streamlit reruns:
    # step 0 = nothing scraped yet; step 1 = variant A scraped, B editable.
    if 'step' not in st.session_state:
        st.session_state.step = 0

    if st.button('Scrape A/B'):
        # Variant A is whatever the live page currently serves.
        title_A, description_A = extract_title_description(url)
        st.session_state['title_A'] = title_A
        st.session_state['description_A'] = description_A
        st.session_state.step = 1

    if st.session_state.step == 1:
        # Variant B starts prefilled with A's text so the user edits a copy.
        title_B = st.text_input('Title B', value=st.session_state.get('title_A', ''))
        description_B = st.text_area('Description B', value=st.session_state.get('description_A', ''))

        if st.button('Predict A/B'):
            if query and url:
                binary_result_A, confidence_A = predict(query, st.session_state['title_A'], st.session_state['description_A'], url, fasttext_model)
                binary_result_B, confidence_B = predict(query, title_B, description_B, url, fasttext_model)
                
                st.write(f'Results for A: Predicted +/-: {binary_result_A}, Conf.: {confidence_A:.2%}')
                st.write(f'Results for B: Predicted +/-: {binary_result_B}, Conf.: {confidence_B:.2%}')
                
                # Verdict compares only the binary labels, not the confidences.
                if binary_result_A == 1 and binary_result_B == 0:
                    st.write("B is worse than A")
                elif binary_result_A == 0 and binary_result_B == 1:
                    st.write("B is better than A")
                else:
                    st.write("B is the same as A")