Update app.py
app.py CHANGED
@@ -6,12 +6,21 @@ import numpy as np
 from io import StringIO
 import sys
 import time
+from pymongo import MongoClient
 
 # File Imports
 from embedding import get_embeddings # Ensure this file/module is available
 from preprocess import filtering # Ensure this file/module is available
 from search import *
 
+
+# Mongo Connections
+srv_connection_uri = "mongodb+srv://adityasm1410:[email protected]/?retryWrites=true&w=majority&appName=Patseer"
+
+client = MongoClient(srv_connection_uri)
+db = client['embeddings']
+collection = db['data']
+
 # Cosine Similarity Function
 def cosine_similarity(vec1, vec2):
     vec1 = np.array(vec1)
@@ -42,16 +51,21 @@ class StreamCapture:
 
 # Main Function
 def score(main_product, main_url, product_count, link_count, search, logger, log_area):
+
+    existing_products_urls = set(collection.distinct('url'))
+
     data = {}
     similar_products = extract_similar_products(main_product)[:product_count]
+
 
+    # Normal Filtering + Embedding -----------------------------------------------
     if search == 'All':
 
         def process_product(product, search_function, main_product):
             search_result = search_function(product)
             return filtering(search_result, main_product, product, link_count)
-
-
+
+
         search_functions = {
             'google': search_google,
             'duckduckgo': search_duckduckgo,
@@ -91,16 +105,47 @@ def score(main_product, main_url, product_count, link_count, search, logger, log
         elif search == 'wikipedia':
             data[product] = filtering(search_wikipedia(product), main_product, product, link_count)
 
+
+    # Filtered Link -----------------------------------------
     logger.write("\n\nFiltered Links ------------------>\n")
     logger.write(str(data) + "\n")
     log_area.text(logger.getvalue())
 
+
+
+    # Main product Embeddings ---------------------------------
     logger.write("\n\nCreating Main product Embeddings ---------->\n")
-    main_result, main_embedding = get_embeddings(main_url,tag_option)
-    log_area.text(logger.getvalue())
 
+    # Check main product in MongoDB
+    if main_url in existing_products_urls:
+        saved_data = collection.find_one({'url': main_url})
+
+        if tag_option not in saved_data:
+            main_result , main_embedding = get_embeddings(main_url,tag_option)
+        else:
+            main_embedding = saved_data[tag_option]
+    else:
+        main_result , main_embedding = get_embeddings(main_url,tag_option)
+
+    log_area.text(logger.getvalue())
     print("main",main_embedding)
-
+
+    update_doc = {
+        '$set': {
+            'product_name': main_product,
+            'url': main_url,
+            tag_option: main_embedding
+        }
+    }
+
+    collection.update_one(
+        {'url': main_url},
+        update_doc,
+        upsert=True
+    )
+
+
+    #Similar Products Check
     cosine_sim_scores = []
 
     logger.write("\n\nCreating Similar product Embeddings ---------->\n")
@@ -116,9 +161,15 @@ def score(main_product, main_url, product_count, link_count, search, logger, log
             cosine_sim_scores.append((product,'No Product links Found Increase Number of Links or Change Search Source',None,None))
 
         else:
-            for link in data[product][:link_count]:
+            for link,present in data[product][:link_count]:
+
+                saved_data = collection.find_one({'url': link})
+
+                if present and (tag_option in saved_data):
+                    similar_embedding = saved_data[tag_option]
+                else:
+                    similar_result, similar_embedding = get_embeddings(link,tag_option)
 
-                similar_result, similar_embedding = get_embeddings(link,tag_option)
                 log_area.text(logger.getvalue())
 
                 print(similar_embedding)
@@ -126,10 +177,24 @@ def score(main_product, main_url, product_count, link_count, search, logger, log
                     score = cosine_similarity(main_embedding[i], similar_embedding[i])
                     cosine_sim_scores.append((product, link, i, score))
                 log_area.text(logger.getvalue())
+
+                update_doc = {
+                    '$set': {
+                        'product_name': product,
+                        'url': link,
+                        tag_option: similar_embedding
+                    }
+                }
+
+                collection.update_one(
+                    {'url': link},
+                    update_doc,
+                    upsert=True
+                )
 
     logger.write("--------------- DONE -----------------\n")
     log_area.text(logger.getvalue())
-    return cosine_sim_scores
+    return cosine_sim_scores
 
 # Streamlit Interface
 st.title("Check Infringement")
@@ -155,7 +220,7 @@ if st.button('Check for Infringement'):
 
     with st.spinner('Processing...'):
        with StreamCapture() as logger:
-            cosine_sim_scores
+            cosine_sim_scores = score(main_product, main_url,product_count, link_count, search_method, logger, log_output)
 
     st.success('Processing complete!')
 
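The whole change reduces to a get-or-compute-then-upsert cache keyed on url. Below is a minimal sketch of that pattern, assuming a placeholder connection string and a hypothetical compute_embedding() standing in for this repo's get_embeddings(); it illustrates the idea rather than reproducing the app's code.

from pymongo import MongoClient

# Placeholder URI -- the commit hardcodes a live Atlas connection string here.
client = MongoClient("mongodb+srv://<user>:<password>@<cluster>/")
collection = client["embeddings"]["data"]

def compute_embedding(url, tag_option):
    # Hypothetical stand-in for this repo's get_embeddings(url, tag_option);
    # returns a (result_text, embedding) pair like the real function does.
    return "", [[0.0, 0.0, 0.0]]

def get_or_create_embedding(url, product_name, tag_option):
    # Reuse the cached embedding for (url, tag_option) when one exists.
    saved = collection.find_one({"url": url})
    if saved is not None and tag_option in saved:
        return saved[tag_option]

    # Otherwise compute it and upsert it so the next run skips this step.
    _, embedding = compute_embedding(url, tag_option)
    collection.update_one(
        {"url": url},
        {"$set": {"product_name": product_name, "url": url, tag_option: embedding}},
        upsert=True,
    )
    return embedding

Keying documents on url and writing one field per tag_option lets a single document accumulate embeddings for every tag option over time, and upsert=True makes first writes and later refreshes the same call. Note the explicit "saved is not None" guard in the sketch: the committed code instead relies on "present" short-circuiting before "tag_option in saved_data", which raises a TypeError if present is ever True for a URL that is not yet in the collection.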