In [23]:
import hopsworks
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
import os

In [24]:
# Connect to Hopsworks with the API key taken from the environment
# (load_dotenv() is called first so a local .env file can supply it).
load_dotenv()

api_key = os.environ.get("HOPSWORKS_API_KEY")
project = hopsworks.login(
    api_key_value=api_key,
    project="orestavf",
)
fs = project.get_feature_store()


2025-01-08 19:52:22,417 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-08 19:52:22,421 INFO: Initializing external client
2025-01-08 19:52:22,421 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-08 19:52:23,548 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1158296


In [3]:
# Read the preprocessed feedback rows from version 1 of the "job_feedback" feature group
feedback_fg = fs.get_feature_group(
    version=1,
    name="job_feedback",
)
feedback_df = feedback_fg.read()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.84s) 


In [4]:
# Hold out 20% of the feedback rows for validation; the fixed seed keeps the split repeatable
train_df, val_df = train_test_split(
    feedback_df,
    random_state=42,
    test_size=0.2,
)

In [5]:
# Prepare data for SentenceTransformer
def prepare_examples(df):
    """Convert feedback rows into SentenceTransformer training examples.

    Args:
        df: DataFrame with "resume_text", "job_description" and "is_relevant" columns.

    Returns:
        A list with one InputExample per row of ``df`` (in row order) whose
        ``texts`` are ``[resume_text, job_description]`` and whose ``label``
        is ``is_relevant`` cast to float.

    Raises:
        KeyError: if any of the three required columns is missing.
    """
    # Walk the three columns in lockstep instead of using DataFrame.iterrows(),
    # which builds a new Series for every row just to read three fields.
    return [
        InputExample(
            texts=[resume_text, job_description],
            label=float(is_relevant),  # Convert to float for loss calculation
        )
        for resume_text, job_description, is_relevant in zip(
            df["resume_text"], df["job_description"], df["is_relevant"]
        )
    ]

In [6]:
# Build the InputExample lists for the training and validation splits (in that order)
train_examples, val_examples = [
    prepare_examples(split_df) for split_df in (train_df, val_df)
]

In [7]:
# Start from the pretrained multilingual MiniLM paraphrase checkpoint
model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

2025-01-08 19:25:05,476 INFO: Use pytorch device_name: cpu
2025-01-08 19:25:05,477 INFO: Load pretrained SentenceTransformer: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


In [8]:
# Batch the examples 16 at a time: training data is shuffled, validation order is kept
train_dataloader = DataLoader(train_examples, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_examples, batch_size=16, shuffle=False)

In [9]:
# Training objective: cosine-similarity loss bound to the model being fine-tuned
train_loss = losses.CosineSimilarityLoss(model=model)

In [10]:
# Training schedule: 3 epochs, with the first 10% of all optimisation steps used as warmup
num_epochs = 3
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_training_steps * 0.1)

In [11]:
# Fine-tune on the training batches and write the resulting model to ./finetuned_model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path="./finetuned_model",
    evaluator=None,  # no evaluator is used in this run; add one if needed
)

  0%|          | 0/3 [00:00<?, ?it/s]

{'train_runtime': 5.2094, 'train_samples_per_second': 2.879, 'train_steps_per_second': 0.576, 'train_loss': 0.27454523245493573, 'epoch': 3.0}
2025-01-08 19:25:14,162 INFO: Save model to ./finetuned_model


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [None]:
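# Saving the trained model locally is already done by model.fit(), which is
# called with output_path="./finetuned_model". The explicit save below is
# therefore disabled and kept for reference only.
#model.save("./finetuned_model")
#print("Model finetuned and saved locally!")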

In [12]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

In [13]:
# Define the Model Schema
# Draw ONE training row with a fixed seed and take both the input and the
# output example from it. Sampling X and y separately (and unseeded) would
# pair the inputs of one row with the label of another and give a different
# example on every run.
schema_sample_row = train_df.sample(1, random_state=42)
X_train_sample = schema_sample_row[["resume_text", "job_description"]].values  # Input example
y_train_sample = schema_sample_row["is_relevant"].values  # Output example

In [14]:
# Build the input/output schemas from the sample arrays and bundle them into a model schema
input_schema, output_schema = Schema(X_train_sample), Schema(y_train_sample)
model_schema = ModelSchema(
    input_schema=input_schema,
    output_schema=output_schema,
)

In [15]:
# Get Model Registry
# Handle to the model registry of the logged-in Hopsworks project; used below
# to create the model entry.
mr = project.get_model_registry()

In [19]:
# Create the registry entry (version 1) that the fine-tuned model files are saved under
job_matching_model = mr.python.create_model(
    name="job_matching_sentence_transformer",
    version=1,
    description="Finetuned SentenceTransformer for job matching",
    input_example=X_train_sample,
    model_schema=model_schema,
    # metrics=metrics,  # not passed: this notebook does not compute evaluation metrics
)

In [20]:
# Save model artifacts to the Model Registry
# Uploads the local ./finetuned_model directory (the output_path written by
# model.fit) under the registry entry created above, then prints a confirmation.
job_matching_model.save("./finetuned_model")
print("Model registered in Hopsworks Model Registry!")

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/727 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/212 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/470637416 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/242 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/21034 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/56 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/1015 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/17082987 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/1512 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/14763260 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/305 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/6678 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/216 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1158296/models/job_matching_sentence_transformer/1
Model registered in Hopsworks Model Registry!


In [22]:
# Push the model to huggingface
# Publishes the in-memory fine-tuned model to the Hugging Face Hub repository
# "forestav/job_matching_sentence_transformer"; the call's return value (shown
# as the cell output) is the URL of the created commit.
# NOTE(review): presumably requires Hugging Face credentials to be configured
# in this environment — confirm before re-running.
model.push_to_hub("forestav/job_matching_sentence_transformer")

2025-01-08 19:44:05,458 INFO: Save model to C:\Users\Filip\AppData\Local\Temp\tmpa217ndkp


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

'https://huggingface.co/forestav/job_matching_sentence_transformer/commit/7168a70785fae3fee6f5576b40a7556072ba31a2'