submission-template

Sleeping

File size: 1,762 Bytes

cf772c6
 
 
04c9a68
7903e8c
 
15bc51b
258e407
 
cf772c6
cae897c
cf772c6
 
 
 
 
 
 
 
 
 
 
 
258e407
cf772c6
 
258e407
 
cf772c6
 
 
 
 
601d216
 
 
 
cae897c
 
 
 
c9878b6
258e407
601d216
cae897c
ced4359
cf772c6

import pickle
import re
import string
import pandas as pd
import sys
sys.path.append(".") 
from tasks.utils.preprocessing import process_text
import json
from sklearn.feature_extraction.text import TfidfVectorizer
    
def predict(input_df: pd.DataFrame, tfidf_path: str, tfidf_voc_path: str, tfidf_idf_path: str, model_path: str):
    """
    Predict labels for the "quote" column using a rebuilt TF-IDF vectorizer and a saved model.

    The vectorizer is not unpickled as a whole: it is reconstructed from three
    artifacts (constructor parameters, vocabulary and IDF weights), and the
    project's ``process_text`` function is plugged in as its preprocessor.

    Parameters:
        input_df (pd.DataFrame): Input data containing a "quote" column with the
            text to classify. Objects exposing a ``to_pandas()`` method are also
            accepted and are converted first.
        tfidf_path (str): Path to a JSON file holding the keyword arguments
            passed to ``TfidfVectorizer``.
        tfidf_voc_path (str): Path to a pickle file holding the fitted
            vocabulary, passed as the vectorizer's ``vocabulary``.
        tfidf_idf_path (str): Path to a pickle file holding the fitted IDF
            weights, assigned to the vectorizer's ``idf_``.
        model_path (str): Path to a pickle file holding the trained model; it
            must expose ``predict`` (a Random Forest according to the original
            documentation -- not verifiable from this function).

    Returns:
        The output of ``model.predict`` on the TF-IDF features (presumably one
        prediction per input row).

    Note:
        ``pickle.load`` can execute arbitrary code while loading. Only pass
        paths to artifacts that come from a trusted source.
    """

    # Load the TF-IDF constructor parameters (stored as JSON, not pickle)
    with open(tfidf_path, "rb") as tfidf_file:
        params = json.load(tfidf_file)

    # SECURITY: the three pickle loads below must only read trusted files.
    # Load the trained model
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)

    # Load vocabulary
    with open(tfidf_voc_path, "rb") as f:
        vocab = pickle.load(f)

    # Load IDF weights
    with open(tfidf_idf_path, "rb") as f:
        idf = pickle.load(f)

    # Rebuild the fitted vectorizer. The preprocessor is set here because a
    # function cannot be stored in the JSON parameter file.
    tfidf_vectorizer = TfidfVectorizer(**params)
    tfidf_vectorizer.set_params(preprocessor=process_text)
    tfidf_vectorizer.set_params(vocabulary=vocab)
    # Assigned after the vocabulary so the vectorizer can match the IDF vector
    # against it.
    tfidf_vectorizer.idf_ = idf

    # Convert only when needed: a plain pandas DataFrame has no `to_pandas`,
    # so calling it unconditionally would raise AttributeError.
    frame = input_df.to_pandas() if hasattr(input_df, "to_pandas") else input_df

    # Transform the input text using the TF-IDF vectorizer
    text_data = frame["quote"]
    text_features = tfidf_vectorizer.transform(text_data)

    # Make predictions using the loaded model
    predictions = model.predict(text_features)

    return predictions