Spaces:

forestav
/

jobsai

Running

File size: 4,532 Bytes

7b766ee

#!/usr/bin/env python3

import os
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses
# If you want to push to the HF Hub/Spaces programmatically:
#   pip install huggingface_hub
# from huggingface_hub import HfApi, HfFolder

def main():
    #--------------------------------------------------------------------------
    # 1. (Optional) Setup your Hugging Face auth
    #--------------------------------------------------------------------------
    # If you need to log into your HF account, you can do:
    #   hf_token = os.getenv("HF_TOKEN")  # or read from a config file
    #   HfFolder.save_token(hf_token)
    #   api = HfApi()
    #
    # Then set something like:
    #   repo_id = "KolumbusLindh/my-weekly-model"
    #
    # Alternatively, you can push manually later via huggingface-cli.

    #--------------------------------------------------------------------------
    # 2. Placeholder training data
    #--------------------------------------------------------------------------
    # Suppose each tuple is: (CV_text, liked_job_text, disliked_job_text).
    # In a real scenario, you'd gather user feedback from your database.
    train_data = [
        ("My CV #1", "Job #1 that user liked", "Job #1 that user disliked"),
        ("My CV #2", "Job #2 that user liked", "Job #2 that user disliked"),
        # ...
    ]

    #--------------------------------------------------------------------------
    # 3. Convert data into Sentence Transformers InputExamples
    #--------------------------------------------------------------------------
    train_examples = []
    for (cv_text, liked_job_text, disliked_job_text) in train_data:
        example = InputExample(
            texts=[cv_text, liked_job_text, disliked_job_text]
            # TripletLoss expects exactly 3 texts: anchor, positive, negative
        )
        train_examples.append(example)

    #--------------------------------------------------------------------------
    # 4. Load the base model
    #--------------------------------------------------------------------------
    model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    model = SentenceTransformer(model_name)

    #--------------------------------------------------------------------------
    # 5. Prepare DataLoader & define the Triplet Loss
    #--------------------------------------------------------------------------
    # A typical margin is 0.5–1.0. Feel free to adjust it.
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
    train_loss = losses.TripletLoss(
        model=model,
        distance_metric=losses.TripletDistanceMetric.COSINE,
        margin=0.5
    )

    #--------------------------------------------------------------------------
    # 6. Fine-tune (fit) the model
    #--------------------------------------------------------------------------
    # Just 1 epoch here for demo. In practice, tune #epochs/batch_size, etc.
    num_epochs = 1
    warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)  # ~10% warmup

    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        show_progress_bar=True
    )

    #--------------------------------------------------------------------------
    # 7. Save model locally
    #--------------------------------------------------------------------------
    local_output_path = "my_finetuned_model"
    model.save(local_output_path)
    print(f"Model fine-tuned and saved locally to: {local_output_path}")

    #--------------------------------------------------------------------------
    # 8. (Optional) Push to your Hugging Face Space
    #--------------------------------------------------------------------------
    # If you want to push automatically:
    #
    #   model.push_to_hub(repo_id=repo_id, commit_message="Weekly model update")
    #
    # Or if you have a Space at e.g. https://huggingface.co/spaces/KolumbusLindh/<some-name>,
    # you’d create a repo on HF, then push to that repo. Typically one uses
    # huggingface-cli or the huggingface_hub methods for that:
    #
    #   api.create_repo(repo_id=repo_id, repo_type="model", private=False)
    #   model.push_to_hub(repo_id=repo_id)
    #
    #   # If it's a Space, you might need to store your model in the "models" folder
    #   # or however your Gradio app is set up to load it.

if __name__ == "__main__":
    main()