Spaces:
Sleeping
Sleeping
import torch | |
import pandas as pd | |
import numpy as np | |
import gradio as gr | |
from sklearn.metrics.pairwise import cosine_similarity | |
from sentence_transformers import util, SentenceTransformer | |
import ast | |
import json | |
import re | |
# Load the precomputed embedding matrix and the metadata table whose rows are
# looked up by the same positional index as the embedding rows.
# NOTE(security): weights_only=False lets torch.load unpickle arbitrary
# objects, so "embeddings.pth" must only ever come from a trusted source.
# map_location="cpu" keeps the load working on CPU-only hosts even when the
# tensor was saved from a GPU; query vectors are produced on the CPU as well
# (model.encode returns NumPy arrays by default).
embeddings = torch.load(
    "embeddings.pth",
    map_location="cpu",
    weights_only=False,
)  # shape: [377, 768]
data_df = pd.read_csv("data.csv")

# Load the model once at import time so every request reuses it.
# Presumably this must be the same model that produced embeddings.pth --
# TODO confirm. Previously tried: SentenceTransformer("all-MiniLM-L6-v2")
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
def extract_duration(text):
    """Return the first run of digits found in ``text`` as an int.

    ``text`` is converted with ``str()`` first, so non-string values
    (numbers, ``None``, NaN) are accepted. Returns 0 when no digits occur.
    """
    first_number = re.search(r"\d+", str(text))
    if first_number is None:
        return 0
    return int(first_number.group())
# Display names for the single-letter test-type codes.
type_mapping = {
    "A": "Ability & Aptitude",
    "B": "Biodata & Situational Judgement",
    "C": "Competencies",
    "D": "Development & 360",
    "E": "Assessment Exercises",
    "K": "Knowledge & Skills",
    "P": "Personality & Behavior",
    "S": "Simulations",
}


def decode_test_types(test_type_raw):
    """Translate test-type codes into their display names.

    Args:
        test_type_raw: Either a string holding a Python literal (for example
            ``"['A', 'K']"``) or an already-parsed list/tuple of codes.

    Returns:
        A list with one entry per code: the name from ``type_mapping`` or,
        for a code that is not in the mapping, the stripped code itself.
        Returns ``[]`` when the input cannot be parsed or its items are not
        strings (best-effort decoding, never raises for bad data).
    """
    if isinstance(test_type_raw, (list, tuple)):
        # Already parsed; literal_eval would reject it and lose the data.
        codes = test_type_raw
    else:
        try:
            codes = ast.literal_eval(test_type_raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid literal (free text, None, NaN, ...).
            return []

    decoded = []
    try:
        for code in codes:
            key = code.strip()
            decoded.append(type_mapping.get(key, key))
    except (TypeError, AttributeError):
        # Parsed value is not iterable, or an item is not a string.
        return []
    return decoded
def clean_query_text(text):
    """Normalize split spellings of "JavaScript" in ``text``.

    Any occurrence of "java" followed by whitespace and "script", in any
    letter case (``"Java Script"``, ``"java script"``, ``"JAVA  SCRIPT"``),
    is rewritten to ``"JavaScript"``. All other text is returned unchanged.

    Args:
        text: The query string to normalize.

    Returns:
        The normalized string.
    """
    # One case-insensitive pattern instead of an enumerated list of casings,
    # so variants such as all-caps or double spaces are covered too.
    return re.sub(r"java\s+script", "JavaScript", text, flags=re.IGNORECASE)
def prepare_input(query):
    """Return ``query`` cleaned by ``clean_query_text`` with outer whitespace removed."""
    normalized = clean_query_text(query)
    return str(normalized).strip()
def find_top_k(query: str, k: int = 5):
    """Return the ``k`` rows of ``data_df`` most similar to ``query``.

    The query is cleaned with ``prepare_input``, encoded with the module-level
    ``model`` (normalized embeddings), and compared to ``embeddings`` by
    cosine similarity. Rows are returned in descending score order.

    Args:
        query: Free-text query.
        k: Maximum number of results. Values below zero are treated as zero,
            non-integer numbers are truncated, and ``None`` returns every row.

    Returns:
        A list of dicts with the keys ``url``, ``adaptive_support``,
        ``description``, ``duration``, ``remote_support`` and ``test_type``.
    """
    # A negative k would slice from the end (``[:-1]``) and return almost
    # every row; a float k (e.g. from a slider) would raise on slicing.
    limit = None if k is None else max(int(k), 0)

    query_str = prepare_input(query)
    query_vec = model.encode([query_str], normalize_embeddings=True)
    scores = util.cos_sim(query_vec, embeddings)[0].numpy()
    # Negate so that argsort's ascending order yields best matches first.
    ranked_indices = np.argsort(-scores)

    results = []
    for idx in ranked_indices[:limit]:
        item = data_df.iloc[idx]
        results.append({
            "url": item["url"],
            "adaptive_support": item["adaptive"],
            "description": item["description"],
            "duration": extract_duration(item["assessment_length"]),
            "remote_support": item["remote"],
            "test_type": decode_test_types(item["test_types"]),
        })
    return results
def health():
    """Return the liveness payload ``{"status": "healthy"}``."""
    # Return the plain value: a "json" output component renders it directly.
    # Returning a gr.JSON instance relies on Gradio's component-as-update
    # handling rather than handing back the payload itself.
    return {"status": "healthy"}
def recommend(query):
    """Return the top matches for ``query`` under ``"recommended_assessments"``.

    Args:
        query: Free-text query passed on to ``find_top_k``.

    Returns:
        A dict ``{"recommended_assessments": [...]}`` holding the list that
        ``find_top_k`` produced with its default ``k``.
    """
    # Return the plain payload: a "json" output component renders it directly.
    # Returning a gr.JSON instance relies on Gradio's component-as-update
    # handling rather than handing back the payload itself.
    return {"recommended_assessments": find_top_k(query)}
# One Interface per endpoint function.
recommend_api = gr.Interface(
    fn=recommend,
    inputs=gr.Textbox(),
    outputs="json",
)
health_api = gr.Interface(
    fn=health,
    inputs=[],
    outputs="json",
)

# Gradio app with multiple endpoints, one tab each.
demo = gr.TabbedInterface(
    interface_list=[recommend_api, health_api],
    tab_names=["recommend", "health"],
)

# Start the server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
# Gradio Interface | |
# app = gr.Interface( | |
# fn=recommend, | |
# inputs=gr.Textbox(label="Job Description or Query"), | |
# outputs="json", | |
# examples=["Looking for java developer assessment", "Communication skills test"] | |
# ) | |
# # Add `/health` route manually using FastAPI inside Gradio | |
# app.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True, inline=False) | |
# with gr.Blocks() as demo: | |
# gr.Markdown("### SHL Assessment Recommender") | |
# query_input = gr.Textbox(label="Job Query", placeholder="e.g. JavaScript Developer") | |
# duration_input = gr.Textbox(label="Assessment Duration (minutes)", placeholder="e.g. 30") | |
# topk_input = gr.Slider(label="Top K Results", minimum=1, maximum=10, step=1, value=5) | |
# output = gr.JSON(label="Top Matches") | |
# submit_btn = gr.Button("Submit") | |
# def process(query, duration, top_k): | |
# return find_top_k(query, duration, top_k) | |
# submit_btn.click(fn=process, inputs=[query_input, duration_input, topk_input], outputs=[output]) | |
# demo.launch() | |
# def find_top_k(query_json, k=5): | |
# query_str = prepare_input(query_json) | |
# # Convert query to vector | |
# query_vec = model.encode([query_str], normalize_embeddings=True) | |
# # Cosine similarity with precomputed normalized embeddings | |
# scores = util.cos_sim(query_vec, embeddings)[0].numpy() | |
# ranked_indices = np.argsort(-scores) | |
# results = [] | |
# for idx in ranked_indices[:k]: | |
# item = data_df.iloc[idx] | |
# result = { | |
# "name": item["name"], | |
# "url": item["url"], | |
# "remote_testing": item["remote"], | |
# "adaptive": item["adaptive"], | |
# "duration": item["assessment_length"], | |
# "test_type": item["test_types"], | |
# } | |
# results.append(result) | |
# return results | |
# # Gradio Interface | |
# with gr.Blocks() as demo: | |
# gr.Markdown("### RAG Gradio Demo with JSON Query") | |
# json_input = gr.Textbox(label="JSON Query (as JSON string)") | |
# output = gr.JSON(label="Top Matches from Data") | |
# def process(json_input_str): | |
# try: | |
# query_json = json.loads(json_input_str) | |
# results = find_top_k(query_json) | |
# return results | |
# except Exception as e: | |
# return {"error": str(e)} | |
# submit_btn = gr.Button("Submit") | |
# submit_btn.click(fn=process, inputs=[json_input], outputs=[output]) | |
# demo.launch() | |
# import torch | |
# import pandas as pd | |
# import numpy as np | |
# import gradio as gr | |
# from sklearn.metrics.pairwise import cosine_similarity | |
# from sentence_transformers import util,SentenceTransformer | |
# # Load embeddings and data | |
# embeddings = torch.load("embeddings.pth") # shape: [377, 768] | |
# data_df = pd.read_csv("data.csv") | |
# def clean_query_text(text): | |
# replacements = { | |
# "Java Script": "JavaScript", | |
# "java script": "JavaScript", | |
# "Java script": "JavaScript" | |
# } | |
# for wrong, correct in replacements.items(): | |
# text = text.replace(wrong, correct) | |
# return text | |
# def prepare_input(data): | |
# cleaned_query = clean_query_text(data.query) | |
# input_text = f"{cleaned_query}. Candidate should complete assessment in {data.duration} minutes." | |
# return input_text.strip() | |
# def find_top_k(query_json, k=5): | |
# query_str = prepare_input(query_json) | |
# # Convert query to vector | |
# from sentence_transformers import SentenceTransformer | |
# model = SentenceTransformer("all-MiniLM-L6-v2") | |
# query_vec = model.encode([query_str], normalize_embeddings=True) | |
# scores = util.cos_sim(query_vec, embeddings)[0].numpy() | |
# ranked_indices = np.argsort(-scores) | |
# results = [] | |
# for idx in ranked_indices: | |
# item = data_df.iloc[idx] | |
# print(f"Matched: {item['name']} with duration {item['assessment_length']}") | |
# result = { | |
# "name": item["name"], | |
# "url": item["url"], | |
# "remote_testing": item["remote"], | |
# "adaptive": item["adaptive"], | |
# "duration": item['assessment_length'], | |
# "test_type": item["test_types"], | |
# } | |
# results.append(result) | |
# if len(results) >= top_k: | |
# break | |
# return results | |
# # Compute similarity | |
# # similarities = cosine_similarity(query_vec, embeddings.numpy())[0] | |
# # top_indices = similarities.argsort()[-k:][::-1] | |
# # results = data_df.iloc[top_indices].to_dict(orient="records") | |
# # return results | |
# with gr.Blocks() as demo: | |
# gr.Markdown("### RAG Gradio Demo with JSON Query") | |
# json_input = gr.Textbox(label="JSON Query (as string)") | |
# output = gr.JSON(label="Top Matches from Data") | |
# def process(json_input_str): | |
# try: | |
# import json | |
# query_json = json.loads(json_input_str) | |
# results = find_top_k(query_json) | |
# return results | |
# except Exception as e: | |
# return {"error": str(e)} | |
# submit_btn = gr.Button("Submit") | |
# submit_btn.click(fn=process, inputs=[json_input], outputs=[output]) | |
# demo.launch() | |