File size: 2,231 Bytes
21a6064
3b25244
 
 
 
21a6064
ec88281
dd2fdf9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import streamlit as st
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
from ast import literal_eval


# Name of the local directory holding the embedder. It is interpolated into the
# relative paths "./<model_choice>" (passed to SentenceTransformer) and
# "./<model_choice>/domains_embs.csv" (the precomputed domain embeddings).
model_choice = "Embedder-typosquat-detect-Canine"

@st.cache_resource
def load_model() -> SentenceTransformer:
    """Build the domain embedder from the local ``model_choice`` directory.

    The ``st.cache_resource`` decorator lets Streamlit reuse the returned
    model instead of constructing it again on later calls.

    Returns:
        The ``SentenceTransformer`` loaded from ``./<model_choice>``.
    """
    model_dir = f"./{model_choice}"
    embedder = SentenceTransformer(model_dir)
    return embedder

# Page heading, rendered first.
st.title("Search for the target of typosquat domains with our Domain Embedder")

# Introductory paragraph, kept as separate fragments for readability and
# concatenated without any separator before being rendered as markdown.
_intro_fragments = (
    "This streamlit demonstrates how you can use our domain embedder to find the targets of typosquatted domains. ",
    "Each domain is represented as an vector embedding that can be stored in a vector store for efficient retrieval. ",
    "The domains you can search for in this application are the top 4k most popular domains, like `google.com`.  ",
    "You can use the domain embedder to create a vector store specifically for the websites **you want to monitor**. ",
    "This can include the services your company uses like Office365, or the websites of your company that may ",
    "become spear phishing targets.",
)
st.markdown("".join(_intro_fragments))

@st.cache_data
def _load_corpus(csv_path):
    """Read the precomputed domain embeddings and build the search matrix.

    Streamlit re-executes the whole script on every widget interaction, so
    without caching the CSV would be re-read and every embedding string
    re-parsed on each rerun. ``st.cache_data`` keys the result on
    ``csv_path``; note that a change to the file on disk under the same path
    is therefore not picked up until the cache is cleared.

    Args:
        csv_path: Path to a CSV with a ``domain`` column and an ``embedding``
            column holding each vector as a Python-literal string.

    Returns:
        A ``(frame, matrix)`` pair: the DataFrame with ``embedding`` parsed
        into Python objects, and the embeddings stacked into a float32
        ``numpy`` array with one row per DataFrame row.
    """
    frame = pd.read_csv(csv_path)
    # literal_eval only accepts Python literals, so it parses the stored
    # vectors without executing arbitrary code from the file.
    frame["embedding"] = frame["embedding"].apply(literal_eval)
    # Cast to float32 so the corpus dtype matches the query embedding dtype.
    matrix = np.stack(frame["embedding"].values).astype(np.float32)
    return frame, matrix


model = load_model()

domains_df, corpus_embeddings = _load_corpus(f'./{model_choice}/domains_embs.csv')
corpus_domains = domains_df.domain.to_list()

# Input widgets: the suspicious domain and how many candidate targets to show.
st.header("Enter a potential typosquatted domain and select the number of top results to retrieve. ")
domain = st.text_input("Potential Typosquatted Domain")
top_k = st.number_input("Top K Results", min_value=1, max_value=50, value=5, step=1)

if st.button("Search for Legitimate Domains"):
    # Strip surrounding whitespace so a blank or padded entry is neither
    # embedded as-is nor mistaken for a real query.
    query = domain.strip()
    if query:
        # Embed the query and rank the corpus against it; float32 matches the
        # dtype of corpus_embeddings.
        query_emb = model.encode(query).astype(np.float32)
        # semantic_search returns one hit list per query; there is a single query.
        semantic_res = util.semantic_search(query_emb, corpus_embeddings, top_k=top_k)[0]
        ids = [hit['corpus_id'] for hit in semantic_res]
        scores = [hit['score'] for hit in semantic_res]

        # corpus_id is used as a row position in corpus_embeddings, which was
        # built row-for-row from domains_df, so select positionally (iloc)
        # rather than by index label (loc).
        res_df = domains_df[['domain']].iloc[ids].copy()
        res_df['score'] = scores

        st.write("Mined Domains:")
        st.dataframe(res_df)
    else:
        st.warning("Please enter a domain to perform the search.")