chgrdj committed on
Commit
a73e92a
·
verified ·
1 Parent(s): d9c0423

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -32
app.py CHANGED
@@ -4,39 +4,42 @@ import pandas as pd
4
  import numpy as np
5
  from ast import literal_eval
6
 
7
- # Load the model
8
- model_name = "./Embedder-Typosquat"
9
- model = SentenceTransformer(model_name)
10
 
11
- # Load the domains and embeddings
12
- domains_df = pd.read_csv('domains_embs.csv')
13
- domains_df.embedding = domains_df.embedding.apply(literal_eval)
14
- corpus_domains = domains_df.domain.to_list()
15
- corpus_embeddings = np.stack(domains_df.embedding.values).astype(np.float32) # Ensure embeddings are float32
16
 
17
- # Streamlit App
18
- st.title("Mining Potential Legitimate Domains from a Typosquatted Domain")
19
- st.write("Enter a potential typosquatted domain and select the number of top results to retrieve.")
 
 
20
 
21
- # User Inputs
22
- domain = st.text_input("Potential Typosquatted Domain")
23
- top_k = st.number_input("Top K Results", min_value=1, max_value=50, value=5, step=1)
24
 
25
- # Button to trigger search
26
- if st.button("Search for Legitimate Domains"):
27
- if domain:
28
- # Perform Semantic Search
29
- query_emb = model.encode(domain).astype(np.float32) # Ensure query embedding is also float32
30
- semantic_res = util.semantic_search(query_emb, corpus_embeddings, top_k=top_k)[0]
31
- ids = [r['corpus_id'] for r in semantic_res]
32
- scores = [r['score'] for r in semantic_res]
33
-
34
- # Create a DataFrame for the results
35
- res_df = domains_df.loc[ids,['domain']].copy()
36
- res_df['score'] = scores
37
-
38
- # Display the result DataFrame
39
- st.write("Mined Domains:")
40
- st.dataframe(res_df)
41
- else:
42
- st.warning("Please enter a domain to perform the search.")
 
 
 
 
 
4
  import numpy as np
5
  from ast import literal_eval
6
 
7
+ # Dropdown to select the model
8
+ model_choice = st.selectbox("Select the embedding model:", ["", "Embedder-typosquat-detect-Canine", "Embedder-typosquat-detect"], index=0)
 
9
 
10
+ # Load the model only if a model is selected
11
+ if model_choice:
12
+ model = SentenceTransformer(f"./{model_choice}")
 
 
13
 
14
+ # Load the domains and embeddings
15
+ domains_df = pd.read_csv(f'./{model_choice}/domains_embs.csv')
16
+ domains_df.embedding = domains_df.embedding.apply(literal_eval)
17
+ corpus_domains = domains_df.domain.to_list()
18
+ corpus_embeddings = np.stack(domains_df.embedding.values).astype(np.float32) # Ensure embeddings are float32
19
 
20
+ # Streamlit App
21
+ st.title("Mining Potential Legitimate Domains from a Typosquatted Domain")
22
+ st.write("Enter a potential typosquatted domain and select the number of top results to retrieve.")
23
 
24
+ # User Inputs
25
+ domain = st.text_input("Potential Typosquatted Domain")
26
+ top_k = st.number_input("Top K Results", min_value=1, max_value=50, value=5, step=1)
27
+
28
+ # Button to trigger search
29
+ if st.button("Search for Legitimate Domains"):
30
+ if domain:
31
+ # Perform Semantic Search
32
+ query_emb = model.encode(domain).astype(np.float32) # Ensure query embedding is also float32
33
+ semantic_res = util.semantic_search(query_emb, corpus_embeddings, top_k=top_k)[0]
34
+ ids = [r['corpus_id'] for r in semantic_res]
35
+ scores = [r['score'] for r in semantic_res]
36
+
37
+ # Create a DataFrame for the results
38
+ res_df = domains_df.loc[ids, ['domain']].copy()
39
+ res_df['score'] = scores
40
+
41
+ # Display the result DataFrame
42
+ st.write("Mined Domains:")
43
+ st.dataframe(res_df)
44
+ else:
45
+ st.warning("Please enter a domain to perform the search.")