cpv_3.1_eval_pipeline

Sleeping

App Files Files Community

cpv_3.1_eval_pipeline / utils /target_classifier.py

leavoigt

Update utils/target_classifier.py

3efd370 almost 2 years ago

raw

history blame

4.97 kB

	# from typing import List, Tuple
	# from typing_extensions import Literal
	# import logging
	# import pandas as pd
	# from pandas import DataFrame, Series
	# from utils.config import getconfig
	# from utils.preprocessing import processingpipeline
	# import streamlit as st
	# from transformers import pipeline

	# ## Labels dictionary ###
	# _lab_dict = {
	# '0':'NO',
	# '1':'YES',
	# }

	# def get_target_labels(preds):

	# """
	# Function that takes the numerical predictions as an input and returns a list of the labels.

	# """

	# # Get label names
	# preds_list = preds.tolist()

	# predictions_names=[]

	# # loop through each prediction
	# for ele in preds_list:

	# # see if there is a value 1 and retrieve index
	# try:
	# index_of_one = ele.index(1)
	# except ValueError:
	# index_of_one = "NA"

	# # Retrieve the name of the label (if no prediction made = NA)
	# if index_of_one != "NA":
	# name = label_dict[index_of_one]
	# else:
	# name = "Other"

	# # Append name to list
	# predictions_names.append(name)

	# return predictions_names

	# @st.cache_resource
	# def load_targetClassifier(config_file:str = None, classifier_name:str = None):
	# """
	# Loads the document classifier using haystack, where the name/path of the model
	# in the HF-hub (as a string) is used to fetch the model object. Either a config file or
	# a model name should be passed.
	# 1. https://docs.haystack.deepset.ai/reference/document-classifier-api
	# 2. https://docs.haystack.deepset.ai/docs/document_classifier
	# Params
	# --------
	# config_file: config file path from which to read the model name
	# classifier_name: if a model name is passed, it takes priority; if not, the
	# config file is used; if neither is given, a warning is logged and nothing is returned.
	# Return: document classifier model
	# """
	# if not classifier_name:
	# if not config_file:
	# logging.warning("Pass either model name or config file")
	# return
	# else:
	# config = getconfig(config_file)
	# classifier_name = config.get('target','MODEL')

	# logging.info("Loading classifier")

	# doc_classifier = pipeline("text-classification",
	# model=classifier_name,
	# top_k =1)

	# return doc_classifier


	# @st.cache_data
	# def target_classification(haystack_doc:pd.DataFrame,
	# threshold:float = 0.5,
	# classifier_model:pipeline= None
	# )->Tuple[DataFrame,Series]:
	# """
	# Text-Classification on the list of texts provided. Classifier provides the
	# most appropriate label for each text. These labels indicate whether the paragraph
	# references a specific action, target or measure.
	# ---------
	# haystack_doc: List of haystack Documents. The output of the Preprocessing Pipeline
	# contains the list of paragraphs in different formats; here the list of
	# Haystack Documents is used.
	# threshold: threshold value for the model to keep the results from classifier
	# classifier_model: you can pass the classifier model directly, which takes priority;
	# if it is not passed, the model is looked up in the streamlit session.
	# In case of streamlit avoid passing the model directly.
	# Returns
	# ----------
	# df: Dataframe with two columns['SDG:int', 'text']
	# x: Series object with the unique SDG covered in the document uploaded and
	# the number of times it is covered/discussed/count_of_paragraphs.
	# """

	# logging.info("Working on target/action identification")

	# haystack_doc['Vulnerability Label'] = 'NA'

	# if not classifier_model:

	# classifier_model = st.session_state['target_classifier']

	# # Get predictions
	# predictions = classifier_model(list(haystack_doc.text))

	# # Get labels for predictions
	# pred_labels = getlabels(predictions)

	# # Save labels
	# haystack_doc['Target Label'] = pred_labels


	# # logging.info("Working on action/target extraction")
	# # if not classifier_model:
	# # classifier_model = st.session_state['target_classifier']

	# # results = classifier_model(list(haystack_doc.text))
	# # labels_= [(l[0]['label'],
	# # l[0]['score']) for l in results]


	# # df1 = DataFrame(labels_, columns=["Target Label","Target Score"])
	# # df = pd.concat([haystack_doc,df1],axis=1)

	# # df = df.sort_values(by="Target Score", ascending=False).reset_index(drop=True)
	# # df['Target Score'] = df['Target Score'].round(2)
	# # df.index += 1
	# # # df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])

	# return haystack_doc