|
import pickle

import nltk
import numpy as np
import pandas as pd
from nltk.stem import SnowballStemmer

nltk.download("punkt_tab")

FILE_PATH = "/app/src/ressources/technologies_database.xlsx"
|
|
|
def set_prompt(problem):
    prompt = """
# ROLE

You are a meticulous senior technical analyst and constraints scout. Your task is to read a short description of a technical problem, identify the distinct constraints it implies, and return them as a JSON object whose entries together cover the whole problem.

# OBJECTIVE

Find all the constraints in this technical problem, making sure each one is premised on the problem alone.
Take different technical domains into account so that the whole problem is covered.
Output each constraint in a JSON object such as: {"title of constraint 1": "description 1", "title of constraint N": "description N"}

# INSTRUCTIONS & RULES

1. **JSON Output**: Your entire response MUST be a single JSON object, as shown in the schema below. Do not include any explanatory text before or after the JSON.
2. **Discover and Iterate**: Your primary task is to scan the technical problem, find each constraint, and create a separate entry for it in the output JSON.
3. **Descriptive Sentences**: You MUST write clear, full sentences that describe each constraint's issues. Do not use single keywords. Base these descriptions on the information in the technical problem.
4. **Infer Where Necessary**: The technical problem may not contain every detail. Infer plausible information from the context.

# JSON SCHEMA & EXAMPLE

{
  "Exposing Compute Resources": "The 6G network shall provide suitable APIs to allow authorized third parties and/or UEs to retrieve availability information about computational resources inside the Service Hosting Environment (SHE) and to utilize these computational resources for running workloads on demand.",
  "Providing AI Compute": "The 6G network shall be able to provide computing resources in the Service Hosting Environment for AI services and provide AI services to UEs.",
  ...
}

---
***NOW, BEGIN THE TASK.***

# TECHNICAL PROBLEM

""" + problem
    return prompt
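
# Usage sketch for set_prompt(): the prompt is sent to an LLM and the reply is
# parsed as JSON. `call_llm` below is a hypothetical stand-in for whatever LLM
# client this project uses, not a function defined in this module:
#
#     import json
#     prompt = set_prompt("UEs need on-demand access to edge compute for AI workloads.")
#     raw_reply = call_llm(prompt)          # hypothetical LLM client call
#     constraints = json.loads(raw_reply)   # {"constraint title": "description", ...}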
|
|
|
def load_technologies_excel():
    # Read the technologies database shipped with the app.
    df = pd.read_excel(FILE_PATH)
    return df
|
|
|
def load_technologies():
    EMBEDDINGS_FILE = '/app/src/ressources/global_tech_embeddings.pkl'

    try:
        with open(EMBEDDINGS_FILE, 'rb') as f:
            loaded_data = pickle.load(f)
        global_tech = loaded_data['global_tech']
        global_tech_embedding = loaded_data['global_tech_embeddings']
        return global_tech, global_tech_embedding
    except Exception as e:
        print(f"Error loading embeddings from {EMBEDDINGS_FILE}: {e}")
        return None, None
|
|
|
def tech_to_dict(technologies):
    # Parse each raw technology block into a dict of its labelled fields.
    tech_dict = []
    for index, tech in enumerate(technologies):
        # Keep only blocks where "<title>" is absent or sits at the very start
        # (str.find returns -1 when missing, so `<= 1` covers both cases).
        if tech.find("<title>") <= 1:
            tab = tech.split("\n")
            tab.pop(0)   # drop the opening line
            tab.pop()    # drop the closing line
            # Each remaining line is "label: value"; keep only the value part.
            tech_dict.append({"title": tab[0][tab[0].find(": ") + 2:],
                              "purpose": tab[1][tab[1].find(": ") + 2:],
                              "key_components": tab[2][tab[2].find(": ") + 2:],
                              "advantages": tab[3][tab[3].find(": ") + 2:],
                              "limitations": tab[4][tab[4].find(": ") + 2:],
                              "id": index})
    return tech_dict
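
# The block format tech_to_dict() expects, reconstructed from the parsing logic
# above (the tag lines and field values are assumptions, shown for clarity):
#
#     <title>
#     title: Network Slicing
#     purpose: Partition one physical network into isolated virtual networks.
#     key_components: NFV, SDN controllers, slice orchestrator.
#     advantages: Isolation and per-slice QoS guarantees.
#     limitations: Orchestration complexity across domains.
#     </title>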
|
|
|
def save_dataframe(df, title):
    pd.DataFrame(df).to_excel(title)
    return title
|
|
|
def stem(data, data_type):
    # NOTE: SnowballStemmer.stem() is meant for single tokens; applied to a
    # whole phrase it lowercases it and only stems the trailing fragment.
    stemmer = SnowballStemmer("english")
    processed_data = []
    if data_type == "technologies":
        # `data` is the list of dicts produced by tech_to_dict().
        for t_item in data:
            processed_data.append({
                "title": stemmer.stem(t_item["title"]),
                "purpose": stemmer.stem(t_item["purpose"]),
                "key_components": stemmer.stem(t_item["key_components"]),
                "advantages": stemmer.stem(t_item["advantages"]),
                "limitations": stemmer.stem(t_item["limitations"]),
                "id": t_item["id"]
            })
    else:
        # `data` is a {title: description} mapping, e.g. the constraints JSON.
        for t_item in data:
            processed_data.append({
                "title": stemmer.stem(t_item),
                "description": stemmer.stem(data[t_item])
            })

    return processed_data
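
# Usage sketch for stem(), covering both input shapes (values are illustrative):
#
#     stemmed_techs = stem(tech_to_dict(raw_blocks), "technologies")
#     stemmed_constraints = stem({"Providing AI Compute": "The network shall..."}, "constraints")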
|
|
|
|
|
def get_technologies_by_id(id_list, technologies):
    # Return the technology dicts whose "id" appears in id_list.
    result = []
    id_set = set(id_list)
    for tech in technologies:
        if tech.get('id') in id_set:
            result.append(tech)
    return result
|
|
|
def save_to_pickle(result_similarites):
    # Pivot the flat list of {constraint, id2, similarity} records into a
    # (constraint x technology) matrix, with NaN where no score was computed.
    constraint_titles = sorted({item['constraint']['title'] for item in result_similarites})
    max_id2 = max(item['id2'] for item in result_similarites)

    row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
    col_labels = list(range(1, max_id2 + 1))

    num_rows = len(constraint_titles)
    num_cols = max_id2

    matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)

    for item in result_similarites:
        row_idx = row_label_to_index[item['constraint']['title']]
        col_idx = item['id2'] - 1  # id2 is 1-based; matrix columns are 0-based
        similarity_value = item['similarity'].item()  # 0-d tensor/array -> float
        matrix[row_idx, col_idx] = similarity_value

    print(f"Successfully created matrix with shape: {matrix.shape}")
    print(f"Number of rows (unique constraints): {num_rows}")
    print(f"Number of columns (max id2): {num_cols}")
    print("\nExample 5x5 block of the created matrix (NaN for missing values):")
    print(matrix[:5, :5])

    output_filename = "cosine_similarity_matrix_with_labels.pkl"
    data_to_save = {
        'matrix': matrix,
        'row_labels': constraint_titles,
        'col_labels': col_labels
    }

    with open(output_filename, 'wb') as f:
        pickle.dump(data_to_save, f)

    print(f"\nMatrix and labels saved to {output_filename}")
    return output_filename
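
# A minimal load-back sketch for the artifact written above, guarded so it only
# runs when this module is executed directly. It assumes save_to_pickle() has
# already been run in the working directory; the keys match data_to_save.
if __name__ == "__main__":
    with open("cosine_similarity_matrix_with_labels.pkl", "rb") as f:
        saved = pickle.load(f)
    # Rows are constraint titles, columns are technology ids (1-based labels).
    print(saved['matrix'].shape, len(saved['row_labels']), len(saved['col_labels']))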