# NOTE: residue from the web file viewer this source was copied from (file
# size "6,471 Bytes", per-line commit hashes and the 1-163 line-number gutter)
# stood here; it is not part of the program.
import pandas as pd
import numpy as np
import xgboost as xgb
import streamlit as st
import requests
from bs4 import BeautifulSoup
from gensim.models import FastText
import joblib
# Load the trained FastText model used to embed the text fields.
try:
    fasttext_model = FastText.load('fasttext_model.bin')
except FileNotFoundError:
    st.error("The FastText model file was not found. Please ensure 'fasttext_model.bin' and its associated files are in the correct directory.")
    # Halt this script run: without the model the name `fasttext_model` is
    # never bound, and any later use would fail with a NameError instead of
    # the readable message above.
    st.stop()

# Load the trained XGBoost model for the combined features.
# NOTE(review): joblib.load unpickles the file, so 'model.pkl' must come from
# a trusted source.
try:
    model = joblib.load('model.pkl')
except FileNotFoundError:
    st.error("The XGBoost model file was not found. Please ensure 'model.pkl' is in the correct directory.")
    # Same reasoning as above: stop instead of continuing without `model`.
    st.stop()
def tokenize(text):
    """Split *text* on whitespace; return an empty list for non-string input."""
    if isinstance(text, str):
        return text.split()
    return []


def embed_text(text_series, fasttext_model):
    """Embed each text as the mean of its token vectors.

    Args:
        text_series: Iterable of texts. Non-string entries produce no tokens.
        fasttext_model: Object exposing ``wv``, which must support
            ``token in wv``, ``wv[token]`` and ``wv.vector_size``.

    Returns:
        ``np.ndarray`` with one row per input text. A text that yields no
        token vectors (empty string, non-string, or no known tokens) is
        represented by an all-zero vector, so the output stays aligned with
        the input and always has ``wv.vector_size`` columns.
    """
    word_vectors = fasttext_model.wv
    embeddings = []
    for text in text_series:
        vectors = [
            word_vectors[token]
            for token in tokenize(text)
            if token in word_vectors
        ]
        if vectors:
            embeddings.append(np.mean(vectors, axis=0))
        else:
            # Keep a placeholder row: dropping it would shift later rows and
            # leave a single-text call with an empty, shapeless array.
            embeddings.append(np.zeros(word_vectors.vector_size))
    return np.array(embeddings)
def preprocess_input(query, title, description, url, fasttext_model):
    """Turn one (query, title, description, url) record into an XGBoost DMatrix.

    Each field is coerced to ``str`` (missing values per ``pd.notna`` become
    an empty string), embedded on its own with ``embed_text``, and the four
    embeddings are stacked horizontally in the order query, title,
    description, url.
    """
    raw_fields = (query, title, description, url)
    texts = [str(value) if pd.notna(value) else '' for value in raw_fields]
    # One single-element Series per field, matching what embed_text iterates.
    field_embeddings = [
        embed_text(pd.Series([text]), fasttext_model) for text in texts
    ]
    return xgb.DMatrix(np.hstack(field_embeddings))
def extract_title_description(url, timeout=10):
    """Fetch *url* and return its ``(title, description)``.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled host cannot hang the app indefinitely.

    Returns:
        A ``(title, description)`` tuple. The title comes from the ``<title>``
        tag and the description from ``<meta name="description">``; each falls
        back to a "No ... found" placeholder when absent. If anything fails
        (bad URL, network error, ...), both elements are the
        "Error extracting ..." placeholders. The function never raises.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        # `.string` is None for an empty <title> or one with nested markup.
        title_tag = soup.title
        if title_tag and title_tag.string:
            title = title_tag.string
        else:
            title = 'No title found'

        # `.get` avoids a KeyError (which would also discard the title) when
        # the meta tag exists but carries no `content` attribute.
        description_tag = soup.find('meta', attrs={'name': 'description'})
        if description_tag:
            description = description_tag.get('content', 'No description found')
        else:
            description = 'No description found'

        return title, description
    except Exception:
        # Deliberate best-effort boundary: callers display the placeholders
        # instead of handling exceptions.
        return 'Error extracting title', 'Error extracting description'
def predict(query, title, description, url, fasttext_model):
    """Score one record with the module-level ``model``.

    Returns:
        ``(binary_prediction, probability)``: the first value returned by
        ``model.predict`` and its 0/1 label using a 0.5 threshold.
    """
    features = preprocess_input(query, title, description, url, fasttext_model)
    scores = model.predict(features, validate_features=False)
    # A single record was submitted, so only the first score is used.
    probability = scores[0]
    return int(probability >= 0.5), probability
# Streamlit interface: page title plus one tab per inference mode.
st.title('CTR Prediction Inference')
tab_labels = ["Single Entry", "Batch Entry", "A/B Test"]
tab1, tab2, tab3 = st.tabs(tab_labels)
# Single-entry tab: scrape the page behind a URL and score it for one query.
with tab1:
    st.header('Single Entry Inference')
    query = st.text_input('Query')
    url = st.text_input('URL')
    if st.button('Predict'):
        # Validate before scraping so no HTTP request is made for empty input.
        if query and url:
            title, description = extract_title_description(url)
            st.write(f'Extracted Title: {title}')
            st.write(f'Extracted Description: {description}')
            binary_result, confidence = predict(query, title, description, url, fasttext_model)
            st.write(f'Predicted +/-: {binary_result}')
            st.write(f'Conf.: {confidence:.2%}')
        else:
            st.write('Please enter both a query and a URL.')
# Batch tab: score every row of an uploaded CSV and offer the result for
# download with the prediction columns placed first.
with tab2:
    st.header('Batch Entry Inference')
    uploaded_file = st.file_uploader("Upload CSV", type="csv")
    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file)
        required_columns = ['Query', 'Title', 'Description', 'URL']
        if set(required_columns).issubset(df.columns):
            predictions = []
            confidences = []
            for _, row in df.iterrows():
                binary_result, confidence = predict(row['Query'], row['Title'], row['Description'], row['URL'], fasttext_model)
                # Collect per-row results; the column assignments below need
                # exactly one entry per DataFrame row.
                predictions.append(binary_result)
                confidences.append(confidence)
            df['+/-'] = predictions
            df['Conf.'] = [f"{conf:.2%}" for conf in confidences]
            cols = ['+/-', 'Conf.'] + [col for col in df.columns if col not in ['+/-', 'Conf.']]
            df = df[cols]
            st.download_button("Download Predictions", df.to_csv(index=False), "predictions.csv")
        else:
            st.write('CSV must contain Query, Title, Description, and URL columns.')
# A/B tab: scrape variant A from the URL, let the user edit a variant B, then
# score both against the same query and compare the binary outcomes.
with tab3:
    st.header('A/B Test Inference')
    query = st.text_input('Query for A/B Test')
    url = st.text_input('URL for A/B Test')

    # `step` persists across Streamlit reruns: 0 = nothing scraped yet,
    # 1 = variant A is stored in the session and variant B can be edited.
    if 'step' not in st.session_state:
        st.session_state.step = 0

    if st.button('Scrape A/B'):
        title_A, description_A = extract_title_description(url)
        st.session_state['title_A'] = title_A
        st.session_state['description_A'] = description_A
        st.session_state.step = 1

    if st.session_state.step == 1:
        # Variant B starts as a copy of A so the user only edits what differs.
        title_B = st.text_input('Title B', value=st.session_state.get('title_A', ''))
        description_B = st.text_area('Description B', value=st.session_state.get('description_A', ''))
        if st.button('Predict A/B'):
            if query and url:
                binary_result_A, confidence_A = predict(query, st.session_state['title_A'], st.session_state['description_A'], url, fasttext_model)
                binary_result_B, confidence_B = predict(query, title_B, description_B, url, fasttext_model)
                st.write(f'Results for A: Predicted +/-: {binary_result_A}, Conf.: {confidence_A:.2%}')
                st.write(f'Results for B: Predicted +/-: {binary_result_B}, Conf.: {confidence_B:.2%}')
                if binary_result_A == 1 and binary_result_B == 0:
                    st.write("B is worse than A")
                elif binary_result_A == 0 and binary_result_B == 1:
                    st.write("B is better than A")
                else:
                    # Both variants received the same binary label.
                    st.write("B is the same as A")
            else:
                st.write('Please enter both a query and a URL.')