iwashuman0405 committed on
Commit
8b8bcac
·
verified ·
1 Parent(s): 407f1e3

Upload 6 files

Browse files
Files changed (6) hide show
  1. .gitattributes +2 -35
  2. README.md +1 -12
  3. app.py +262 -0
  4. data.csv +0 -0
  5. embeddings.pth +3 -0
  6. requirements.txt +11 -0
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ nomic_model/* filter=lfs diff=lfs merge=lfs -text
2
+ embeddings.pth filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,12 +1 @@
1
- ---
2
- title: RAG API
3
- emoji: 🏆
4
- colorFrom: yellow
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 5.24.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # SHL Assessment Recommender
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import pandas as pd
3
+ import numpy as np
4
+ import gradio as gr
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from sentence_transformers import util, SentenceTransformer
7
+ import ast
8
+ import json
9
+ import re
10
+
11
# Load the precomputed embeddings and the catalogue rows they are compared
# against. map_location="cpu" makes the load work on a CPU-only host even if
# the tensor was saved from a GPU; without it torch.load tries to restore the
# tensor onto the device it was saved from.
embeddings = torch.load("embeddings.pth", map_location="cpu")  # shape: [377, 768]
data_df = pd.read_csv("data.csv")

# Load the embedding model once at import time so every request reuses it.
# Earlier alternative kept for reference: SentenceTransformer("all-MiniLM-L6-v2")
# NOTE(review): trust_remote_code=True runs Python code shipped in the model
# repository; consider pinning a revision so that code cannot change silently.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
18
+
19
def extract_duration(text):
    """Return the first run of digits found in ``text`` as an int.

    ``text`` is converted with ``str()`` first, so non-string values are
    accepted. When no digit is present the result is 0.
    """
    found = re.search(r"\d+", str(text))  # only the first number is considered
    if found is None:
        return 0
    return int(found.group())
22
+
23
# Single-letter test-type codes and the human-readable label for each.
type_mapping = {
    "A": "Ability & Aptitude",
    "B": "Biodata & Situational Judgement",
    "C": "Competencies",
    "D": "Development & 360",
    "E": "Assessment Exercises",
    "K": "Knowledge & Skills",
    "P": "Personality & Behavior",
    "S": "Simulations",
}


def decode_test_types(test_type_raw):
    """Translate test-type codes into their labels from ``type_mapping``.

    Args:
        test_type_raw: Either a string holding a Python literal (for example
            ``"['A', 'K']"``) or an already-parsed list/tuple/set of codes.

    Returns:
        A list with one entry per string code. Codes are stripped of
        surrounding whitespace; codes missing from ``type_mapping`` are passed
        through unchanged. Non-string entries are skipped. An empty list is
        returned when the input cannot be interpreted as a collection of
        codes (unparsable text, ``None``, NaN, numbers, ...).
    """
    if isinstance(test_type_raw, str):
        try:
            parsed = ast.literal_eval(test_type_raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # These are the errors literal_eval raises for malformed input;
            # anything else is a genuine bug and should surface.
            return []
    else:
        parsed = test_type_raw

    if isinstance(parsed, str):
        # A bare string literal is a single code, not a sequence of characters.
        parsed = [parsed]
    elif not isinstance(parsed, (list, tuple, set)):
        return []

    decoded = []
    for code in parsed:
        if not isinstance(code, str):
            continue
        code = code.strip()
        decoded.append(type_mapping.get(code, code))
    return decoded
40
+
41
+
42
# "java script" in any letter case, with one or more spaces/tabs between the
# two words. Line breaks are deliberately not bridged.
_JAVA_SCRIPT_RE = re.compile(r"java[ \t]+script", re.IGNORECASE)


def clean_query_text(text):
    """Normalise split spellings of "JavaScript" in a query.

    Every occurrence of "java script" (any capitalisation, any number of
    spaces or tabs between the words) is replaced by "JavaScript". All other
    text is returned unchanged.

    Args:
        text: The raw query string. ``None`` is treated as an empty query.

    Returns:
        The query with the spelling normalised.
    """
    if text is None:
        return ""
    return _JAVA_SCRIPT_RE.sub("JavaScript", text)
51
+
52
def prepare_input(query):
    """Build the text that gets embedded for ``query``.

    The query is passed through ``clean_query_text`` and surrounding
    whitespace is removed.
    """
    return f"{clean_query_text(query)}".strip()
56
+
57
def find_top_k(query: str, k: int = 5):
    """Return the ``k`` catalogue rows most similar to ``query``.

    The query is cleaned with ``prepare_input``, embedded with the
    module-level ``model`` and scored by cosine similarity against the
    precomputed ``embeddings``. Row ``i`` of ``embeddings`` is assumed to
    describe row ``i`` of ``data_df``, because the ranked indices are used
    directly with ``data_df.iloc``.

    Args:
        query: Free-text job description or search query.
        k: Maximum number of results. Values below 1 yield an empty list;
            values larger than the catalogue return every row.

    Returns:
        A list of dicts, best match first, each with the keys ``url``,
        ``adaptive_support``, ``description``, ``duration``,
        ``remote_support`` and ``test_type``.
    """
    # A negative k would turn the slice below into "all but the last |k|"
    # rather than a limit, so non-positive values are handled up front.
    if k < 1:
        return []

    query_vec = model.encode([prepare_input(query)], normalize_embeddings=True)
    scores = util.cos_sim(query_vec, embeddings)[0].numpy()
    # Negating the scores makes argsort's ascending order put the best first.
    top_indices = np.argsort(-scores)[:k]

    results = []
    for idx in top_indices:
        item = data_df.iloc[int(idx)]
        results.append({
            "url": item["url"],
            "adaptive_support": item["adaptive"],
            "description": item["description"],
            "duration": extract_duration(item["assessment_length"]),
            "remote_support": item["remote"],
            "test_type": decode_test_types(item["test_types"]),
        })
    return results
89
+
90
def health():
    """Return a JSON component reporting that the service is healthy."""
    payload = {"status": "healthy"}
    return gr.JSON(payload)
92
+
93
def recommend(query):
    """Return the top matches for ``query`` wrapped in a JSON component.

    The matches come from ``find_top_k`` with its default ``k`` and are
    placed under the ``recommended_assessments`` key.
    """
    matches = find_top_k(query)
    response = {"recommended_assessments": matches}
    return gr.JSON(response)
96
+
97
# One Interface per handler; each one becomes a tab of the combined app.
recommend_api = gr.Interface(recommend, gr.Textbox(), "json")
health_api = gr.Interface(health, [], "json")

# Gradio app with multiple endpoints
demo = gr.TabbedInterface(
    [recommend_api, health_api],
    tab_names=["recommend", "health"],
)

if __name__ == "__main__":
    demo.launch()
108
+ # Gradio Interface
109
+
110
+ # app = gr.Interface(
111
+ # fn=recommend,
112
+ # inputs=gr.Textbox(label="Job Description or Query"),
113
+ # outputs="json",
114
+ # examples=["Looking for java developer assessment", "Communication skills test"]
115
+ # )
116
+
117
+ # # Add `/health` route manually using FastAPI inside Gradio
118
+ # app.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True, inline=False)
119
+ # with gr.Blocks() as demo:
120
+ # gr.Markdown("### SHL Assessment Recommender")
121
+
122
+ # query_input = gr.Textbox(label="Job Query", placeholder="e.g. JavaScript Developer")
123
+ # duration_input = gr.Textbox(label="Assessment Duration (minutes)", placeholder="e.g. 30")
124
+ # topk_input = gr.Slider(label="Top K Results", minimum=1, maximum=10, step=1, value=5)
125
+ # output = gr.JSON(label="Top Matches")
126
+
127
+ # submit_btn = gr.Button("Submit")
128
+
129
+ # def process(query, duration, top_k):
130
+ # return find_top_k(query, duration, top_k)
131
+
132
+ # submit_btn.click(fn=process, inputs=[query_input, duration_input, topk_input], outputs=[output])
133
+
134
+ # demo.launch()
135
+
136
+ # def find_top_k(query_json, k=5):
137
+ # query_str = prepare_input(query_json)
138
+
139
+ # # Convert query to vector
140
+ # query_vec = model.encode([query_str], normalize_embeddings=True)
141
+
142
+ # # Cosine similarity with precomputed normalized embeddings
143
+ # scores = util.cos_sim(query_vec, embeddings)[0].numpy()
144
+ # ranked_indices = np.argsort(-scores)
145
+
146
+ # results = []
147
+ # for idx in ranked_indices[:k]:
148
+ # item = data_df.iloc[idx]
149
+ # result = {
150
+ # "name": item["name"],
151
+ # "url": item["url"],
152
+ # "remote_testing": item["remote"],
153
+ # "adaptive": item["adaptive"],
154
+ # "duration": item["assessment_length"],
155
+ # "test_type": item["test_types"],
156
+ # }
157
+ # results.append(result)
158
+
159
+ # return results
160
+
161
+ # # Gradio Interface
162
+ # with gr.Blocks() as demo:
163
+ # gr.Markdown("### RAG Gradio Demo with JSON Query")
164
+
165
+ # json_input = gr.Textbox(label="JSON Query (as JSON string)")
166
+ # output = gr.JSON(label="Top Matches from Data")
167
+
168
+ # def process(json_input_str):
169
+ # try:
170
+ # query_json = json.loads(json_input_str)
171
+ # results = find_top_k(query_json)
172
+ # return results
173
+ # except Exception as e:
174
+ # return {"error": str(e)}
175
+
176
+ # submit_btn = gr.Button("Submit")
177
+ # submit_btn.click(fn=process, inputs=[json_input], outputs=[output])
178
+
179
+ # demo.launch()
180
+
181
+ # import torch
182
+ # import pandas as pd
183
+ # import numpy as np
184
+ # import gradio as gr
185
+ # from sklearn.metrics.pairwise import cosine_similarity
186
+ # from sentence_transformers import util,SentenceTransformer
187
+
188
+ # # Load embeddings and data
189
+ # embeddings = torch.load("embeddings.pth") # shape: [377, 768]
190
+ # data_df = pd.read_csv("data.csv")
191
+
192
+ # def clean_query_text(text):
193
+ # replacements = {
194
+ # "Java Script": "JavaScript",
195
+ # "java script": "JavaScript",
196
+ # "Java script": "JavaScript"
197
+ # }
198
+ # for wrong, correct in replacements.items():
199
+ # text = text.replace(wrong, correct)
200
+ # return text
201
+
202
+ # def prepare_input(data):
203
+ # cleaned_query = clean_query_text(data.query)
204
+ # input_text = f"{cleaned_query}. Candidate should complete assessment in {data.duration} minutes."
205
+ # return input_text.strip()
206
+
207
+ # def find_top_k(query_json, k=5):
208
+ # query_str = prepare_input(query_json)
209
+ # # Convert query to vector
210
+ # from sentence_transformers import SentenceTransformer
211
+ # model = SentenceTransformer("all-MiniLM-L6-v2")
212
+ # query_vec = model.encode([query_str], normalize_embeddings=True)
213
+ # scores = util.cos_sim(query_vec, embeddings)[0].numpy()
214
+ # ranked_indices = np.argsort(-scores)
215
+
216
+ # results = []
217
+ # for idx in ranked_indices:
218
+ # item = data_df.iloc[idx]
219
+ # print(f"Matched: {item['name']} with duration {item['assessment_length']}")
220
+
221
+ # result = {
222
+ # "name": item["name"],
223
+ # "url": item["url"],
224
+ # "remote_testing": item["remote"],
225
+ # "adaptive": item["adaptive"],
226
+ # "duration": item['assessment_length'],
227
+ # "test_type": item["test_types"],
228
+ # }
229
+ # results.append(result)
230
+
231
+ # if len(results) >= top_k:
232
+ # break
233
+
234
+ # return results
235
+
236
+
237
+ # # Compute similarity
238
+ # # similarities = cosine_similarity(query_vec, embeddings.numpy())[0]
239
+ # # top_indices = similarities.argsort()[-k:][::-1]
240
+
241
+ # # results = data_df.iloc[top_indices].to_dict(orient="records")
242
+ # # return results
243
+
244
+ # with gr.Blocks() as demo:
245
+ # gr.Markdown("### RAG Gradio Demo with JSON Query")
246
+
247
+ # json_input = gr.Textbox(label="JSON Query (as string)")
248
+ # output = gr.JSON(label="Top Matches from Data")
249
+
250
+ # def process(json_input_str):
251
+ # try:
252
+ # import json
253
+ # query_json = json.loads(json_input_str)
254
+ # results = find_top_k(query_json)
255
+ # return results
256
+ # except Exception as e:
257
+ # return {"error": str(e)}
258
+
259
+ # submit_btn = gr.Button("Submit")
260
+ # submit_btn.click(fn=process, inputs=[json_input], outputs=[output])
261
+
262
+ # demo.launch()
data.csv ADDED
The diff for this file is too large to render. See raw diff
 
embeddings.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b17237d1f2eb8b8fa8765c2dd87f8b18ed27ef4844067fb9898ce330bd8e5f5
3
+ size 1732204
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas
2
+ numpy
3
+ sentence-transformers
4
+ torch
5
+ requests
6
+ transformers
7
+ einops
8
+ gradio
9
+ scikit-learn
10
+
11
+