"""Gradio app that recommends assessments for a free-text query.

The query is embedded with a SentenceTransformer model and compared by
cosine similarity against precomputed embeddings loaded from
``embeddings.pth``; the best-matching rows of ``data.csv`` are returned.
Two tabs are exposed: ``recommend`` and ``health``.
"""

import ast
import json
import re

import gradio as gr
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

# Load embeddings and data once, at import time.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects, so "embeddings.pth" must come from a trusted source.
# map_location="cpu" keeps the tensor on the CPU so it can be compared with
# the query vector (encode() returns a NumPy array by default) and converted
# with .numpy() even when the file was saved from a GPU.
embeddings = torch.load(
    "embeddings.pth", weights_only=False, map_location="cpu"
)  # shape: [377, 768]
data_df = pd.read_csv("data.csv")

# Load model once
# model = SentenceTransformer("all-MiniLM-L6-v2")
# NOTE(review): trust_remote_code=True allows custom code shipped with the
# model repository to run locally.
model = SentenceTransformer(
    "nomic-ai/nomic-embed-text-v1", trust_remote_code=True
)

# Compiled once; both helpers below are called per request / per result row.
_FIRST_INT_RE = re.compile(r"\d+")
_JAVA_SCRIPT_RE = re.compile(r"java\s+script", re.IGNORECASE)


def extract_duration(text):
    """Return the first run of digits in *text* as an int, or 0 if none.

    *text* is converted with ``str()`` first, so non-string cell values
    (e.g. a missing value) simply yield 0 when they contain no digits.
    """
    match = _FIRST_INT_RE.search(str(text))
    return int(match.group()) if match else 0


# Single-letter test-type codes mapped to their display names.
type_mapping = {
    "A": "Ability & Aptitude",
    "B": "Biodata & Situational Judgement",
    "C": "Competencies",
    "D": "Development & 360",
    "E": "Assessment Exercises",
    "K": "Knowledge & Skills",
    "P": "Personality & Behavior",
    "S": "Simulations",
}


def decode_test_types(test_type_raw):
    """Decode a stored collection of test-type codes into display names.

    Args:
        test_type_raw: A Python-literal string such as ``"['K', 'P']"``,
            or an already-parsed list/tuple of code strings.

    Returns:
        A list of display names; codes missing from ``type_mapping`` are
        passed through (stripped). Returns ``[]`` when the value cannot be
        parsed or is not an iterable of strings (best-effort decoding).
    """
    try:
        if isinstance(test_type_raw, (list, tuple)):
            codes = test_type_raw
        else:
            codes = ast.literal_eval(test_type_raw)
        return [type_mapping.get(code.strip(), code.strip()) for code in codes]
    except (ValueError, SyntaxError, TypeError, AttributeError, RecursionError):
        # Malformed literal, non-iterable value, or non-string elements.
        return []


def clean_query_text(text):
    """Normalise spelling variants of "Java Script" to "JavaScript".

    Matching is case-insensitive and tolerates any amount of whitespace
    between the two words.
    """
    return _JAVA_SCRIPT_RE.sub("JavaScript", text)


def prepare_input(query):
    """Return the cleaned, whitespace-stripped text to embed for *query*.

    ``None`` is treated as an empty query instead of raising.
    """
    raw_text = "" if query is None else str(query)
    return clean_query_text(raw_text).strip()


def _build_result(item):
    """Convert one ``data_df`` row into the response dict for a match."""
    return {
        "url": item["url"],
        "adaptive_support": item["adaptive"],
        "description": item["description"],
        "duration": extract_duration(item["assessment_length"]),
        "remote_support": item["remote"],
        "test_type": decode_test_types(item["test_types"]),
    }


def find_top_k(query: str, k: int = 5):
    """Return the *k* rows of ``data_df`` most similar to *query*.

    Args:
        query: Free-text job description or search query.
        k: Maximum number of results. Values <= 0 yield an empty list
            (a negative value would otherwise slice from the end and
            return almost every row).

    Returns:
        A list of result dicts, ordered from most to least similar.
    """
    k = max(int(k), 0)
    if k == 0:
        return []

    query_str = prepare_input(query)
    query_vec = model.encode([query_str], normalize_embeddings=True)

    # cos_sim returns a [1, N] tensor; detach/cpu make .numpy() safe even if
    # the embeddings tensor tracks gradients or lives on another device.
    scores = util.cos_sim(query_vec, embeddings)[0].detach().cpu().numpy()

    # Negate so argsort's ascending order puts the highest score first.
    top_indices = np.argsort(-scores)[:k]
    return [_build_result(data_df.iloc[int(idx)]) for idx in top_indices]


def health():
    """Return a fixed "healthy" status payload wrapped in ``gr.JSON``."""
    return gr.JSON({"status": "healthy"})


def recommend(query):
    """Return the top recommendations for *query* wrapped in ``gr.JSON``."""
    recommended = find_top_k(query)
    return gr.JSON({"recommended_assessments": recommended})


recommend_api = gr.Interface(fn=recommend, inputs=gr.Textbox(), outputs="json")
health_api = gr.Interface(fn=health, inputs=[], outputs="json")

# Gradio app with multiple endpoints
demo = gr.TabbedInterface(
    interface_list=[recommend_api, health_api],
    tab_names=["recommend", "health"],
)

if __name__ == "__main__":
    demo.launch()