from transformers import pipeline
from tqdm.auto import tqdm
import pandas as pd
from transformers import AutoTokenizer
import nltk

# Download the nltk data if not present
nltk.download('punkt_tab')
nltk.download('punkt')


class WBGDocTopic:
    """
    Suggest document topics using an ensemble of text classification models.

    The class holds a set of Hugging Face text-classification pipelines and
    aggregates their per-label scores (mean and standard deviation across the
    classifiers) for every input document.

    Attributes:
    -----------
    classifiers : dict
        A dictionary mapping model names to the corresponding classification
        callables (Hugging Face `pipeline` instances when loaded through
        `load_classifiers`).
    device : str or None
        The device specifier forwarded to `pipeline` when the default
        classifiers are loaded.

    Methods:
    --------
    load_classifiers()
        Loads the predefined set of document topic classifiers into the
        `classifiers` dictionary.
    suggest_topics(input_docs: str | list[str]) -> list
        Suggests topics for the given document or list of documents.
    """

    def __init__(self, classifiers: dict | None = None, device: str | None = None):
        """
        Initialize the instance, loading the default classifiers if needed.

        Parameters:
        -----------
        classifiers : dict, optional
            Mapping of model name to classifier callable. When `None`, the
            default set is loaded through `load_classifiers`. A provided
            mapping (even an empty one) is stored as-is and nothing is loaded.
        device : str, optional
            Device specifier forwarded to `pipeline` by `load_classifiers`.
        """
        # Test against None explicitly: `classifiers or {}` would silently swap a
        # caller-supplied empty dict for a new one, so the caller and the instance
        # would no longer share the same mapping.
        self.classifiers = {} if classifiers is None else classifiers
        self.device = device

        if classifiers is None:
            self.load_classifiers()

    def load_classifiers(self):
        """
        Load the predefined topic classifiers into `self.classifiers`.

        One pipeline is created for every (eval, train) index pair in
        0..4 x 0..4 whose two indices differ, i.e. 20 models. All pipelines
        share a single tokenizer instance. Progress is displayed with `tqdm`.
        """
        num_evals = 5
        num_train = 5

        # A single tokenizer is loaded once and reused by every pipeline.
        tokenizer = AutoTokenizer.from_pretrained("avsolatorio/doc-topic-model_eval-04_train-03")

        for i in tqdm(range(num_evals)):
            for j in tqdm(range(num_train)):
                # Pairs with identical eval/train indices are not requested.
                # NOTE(review): presumably no such model is published - confirm.
                if i == j:
                    continue

                model_name = f"avsolatorio/doc-topic-model_eval-{i:02}_train-{j:02}"
                # top_k=None asks the pipeline for the scores of all labels.
                classifier = pipeline(
                    "text-classification",
                    model=model_name,
                    tokenizer=tokenizer,
                    top_k=None,
                    device=self.device,
                )
                self.classifiers[model_name] = classifier

    def suggest_topics(self, input_docs: str | list[str]) -> list[list[dict]]:
        """
        Suggest topics for one document or a list of documents.

        Every document is scored by every classifier; the scores are aligned
        by label and aggregated across the classifiers.

        Parameters:
        -----------
        input_docs : str or list of str
            A single document or a collection of documents.

        Returns:
        --------
        list
            One entry per input document, in input order. Each entry is a list
            of records sorted by descending mean score, with the keys `label`,
            `score_mean`, `score_std` and `doc_idx`. `score_std` is pandas'
            default (sample) standard deviation, so it is NaN when only one
            classifier is available. An empty input yields an empty list.

        Raises:
        -------
        ValueError
            If documents are given but no classifiers are available.
        """
        docs = [input_docs] if isinstance(input_docs, str) else list(input_docs)

        # Nothing to score: skip running the models altogether.
        if not docs:
            return []

        # Fail early with a clear message instead of pandas' opaque
        # "No objects to concatenate" error further down.
        if not self.classifiers:
            raise ValueError("No classifiers are loaded; cannot suggest topics.")

        # per_doc_scores[k] collects one label-indexed frame per classifier for docs[k].
        per_doc_scores = [[] for _ in docs]
        for classifier in self.classifiers.values():
            for doc_idx, label_scores in enumerate(classifier(docs)):
                per_doc_scores[doc_idx].append(
                    pd.DataFrame.from_records(label_scores, index="label")
                )

        topics = []
        for doc_idx, frames in enumerate(per_doc_scores):
            # Column-wise concat aligns the classifiers' scores on the label index.
            all_scores = pd.concat(frames, axis=1)
            mean_probs = all_scores.mean(axis=1).sort_values(ascending=False)
            # Reorder the deviations to follow the ranking of the means.
            std_probs = all_scores.std(axis=1).loc[mean_probs.index]

            output = pd.DataFrame({"score_mean": mean_probs, "score_std": std_probs})
            output["doc_idx"] = doc_idx
            # reset_index turns the "label" index into a regular column.
            topics.append(output.reset_index().to_dict(orient="records"))

        return topics