Spaces:
Sleeping
Sleeping
File size: 8,158 Bytes
74c716c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 |
# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import logging
from typing import Dict, Optional
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from src.utils import default_variables as dv
# --- Module metadata and public API
__all__ = ["SemanticSearchEngine"]
__author__ = ["Victor Calderon"]
__copyright__ = ["Copyright 2023 Victor Calderon"]

# --- Logging
# The root logger is configured at import time so that messages from this
# module are emitted at INFO level with a timestamped format.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s]: %(message)s",
    level=logging.INFO,
)
# Module-level logger, explicitly pinned to INFO.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# --------------------------- CLASS DEFINITIONS -------------------------------
class SemanticSearchEngine:
    """
    Class object for running Semantic Search on the input dataset.

    The engine loads a ``SentenceTransformer`` model, can compute text
    embeddings for one column of a ``datasets.Dataset`` and attach a FAISS
    index to them, and can query an indexed dataset for the entries that
    are nearest to a free-text query.
    """

    def __init__(self, **kwargs):
        """
        Class object for running Semantic Search on the input dataset.

        Parameters
        ------------
        source_colname : str, optional
            Name of the dataset column that holds the text to embed. This
            variable is set to ``summary`` by default.

        embeddings_colname : str, optional
            Name of the dataset column in which the embeddings are stored.
            This variable is set to ``dv.embeddings_colname`` by default.

        corpus_dataset_with_faiss_index : datasets.Dataset, optional
            Dataset that already carries a FAISS index on the embeddings
            column. It is required by ``run_semantic_search``. This
            variable is set to ``None`` by default.

        faiss_gpu_index : int, optional
            Index of the GPU on which the FAISS index is placed when a GPU
            is available. This variable is set to ``0`` by default.
        """
        # --- Defining variables
        # Device to use, i.e. CPU or GPU
        self.device = self._get_device()
        # Embedder model to use
        self.model = "paraphrase-mpnet-base-v2"
        # Defining the embedder (needs 'self.model' and 'self.device')
        self.embedder = self._get_embedder()
        # Column names for the text source and for the embeddings
        self.source_colname = kwargs.get("source_colname", "summary")
        self.embeddings_colname = kwargs.get(
            "embeddings_colname",
            dv.embeddings_colname,
        )
        # GPU used by FAISS. The first GPU (index 0) is the one that a
        # plain 'cuda' device resolves to, and the only one on a
        # single-GPU machine.
        self.faiss_gpu_index = kwargs.get("faiss_gpu_index", 0)
        # Variables used for running semantic search
        self.corpus_dataset_with_faiss_index = kwargs.get(
            "corpus_dataset_with_faiss_index"
        )

    def _get_device(self) -> str:
        """
        Method for determining the device to use.

        Returns
        ----------
        device_type : str
            Type of device to use (e.g. 'cpu' or 'cuda').

            Options:
                - ``cpu`` : Uses a CPU.
                - ``cuda`` : Uses a GPU.
        """
        # Determining the type of device to use
        device_type = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f">> Running on a '{device_type.upper()}' device")

        return device_type

    def _get_embedder(self):
        """
        Method for extracting the Embedder model.

        Returns
        ---------
        embedder : model
            ``SentenceTransformer`` instance for ``self.model``, moved to
            ``self.device``.
        """
        embedder = SentenceTransformer(self.model)
        embedder.to(self.device)

        return embedder

    def generate_corpus_index_and_embeddings(
        self,
        corpus_dataset: "Dataset",
    ) -> "Dataset":
        """
        Method for generating the Text Embeddings and FAISS indices from
        the input dataset.

        Parameters
        ------------
        corpus_dataset : datasets.Dataset
            Dataset containing the text to use to create the text
            embeddings and FAISS indices.

        Returns
        ----------
        corpus_dataset_with_embeddings : datasets.Dataset
            Dataset containing the original data from ``corpus_dataset``
            plus the corresponding text embeddings of the
            ``source_colname`` column. The returned dataset is NOT stored
            on the instance by this method.
        """
        # Inference only: no gradients are needed. Note that this call is
        # not scoped to this method.
        torch.set_grad_enabled(False)
        # --- Generate text embeddings for the source column
        corpus_dataset_with_embeddings = corpus_dataset.map(
            lambda corpus: {
                self.embeddings_colname: self.embedder.encode(
                    corpus[self.source_colname]
                )
            },
            batched=True,
            desc="Computing Semantic Search Embeddings",
        )
        # --- Adding FAISS index
        # 'device=None' keeps the index on the CPU; an integer selects the
        # GPU with that index.
        faiss_device = None if self.device == "cpu" else self.faiss_gpu_index
        corpus_dataset_with_embeddings.add_faiss_index(
            column=self.embeddings_colname,
            faiss_verbose=True,
            device=faiss_device,
        )

        return corpus_dataset_with_embeddings

    def run_semantic_search(
        self,
        query: str,
        top_n: int = 5,
    ) -> Dict:  # sourcery skip: extract-duplicate-method
        """
        Method for running a semantic search on a query after having
        created the corpus of the text embeddings.

        Parameters
        --------------
        query : str
            Text query to use for searching the database.

        top_n : int, optional
            Variable corresponding to the 'Top N' values to return based on
            the similarity score between the input query and the corpus.
            This variable is set to ``5`` by default.

        Returns
        ---------
        match_results : dict
            Dictionary containing the metadata of each of the articles
            that were in the Top-N in terms of being most similar to the
            input query ``query``. The keys are the positions ``0..N-1``
            after sorting by ``relevance`` in descending order, and each
            value holds the ``_id``, ``title``, ``relevance`` (string,
            rounded to 5 decimals) and ``content`` fields.

        Raises
        ---------
        TypeError
            If ``query`` is not a string or ``top_n`` is not an integer.

        ValueError
            If ``top_n`` is not larger than zero, or if no indexed dataset
            has been set on ``corpus_dataset_with_faiss_index``.
        """
        # --- Checking input parameters
        # 'query' - Type
        query_type_arr = (str,)
        if not isinstance(query, query_type_arr):
            msg = ">> 'query' ({}) is not a valid input type ({})".format(
                type(query), query_type_arr
            )
            logger.error(msg)
            raise TypeError(msg)
        # 'top_n' - Type
        top_n_type_arr = (int,)
        if not isinstance(top_n, top_n_type_arr):
            msg = ">> 'top_n' ({}) is not a valid input type ({})".format(
                type(top_n), top_n_type_arr
            )
            logger.error(msg)
            raise TypeError(msg)
        # 'top_n' - Value
        if top_n <= 0:
            msg = f">> 'top_n' ({top_n}) must be larger than '0'!"
            logger.error(msg)
            raise ValueError(msg)
        # --- Checking that the encoder has been indexed correctly
        if self.corpus_dataset_with_faiss_index is None:
            msg = ">>> The FAISS index was not properly set!"
            logger.error(msg)
            raise ValueError(msg)
        # --- Encode the input query and extract the embedding
        query_embedding = self.embedder.encode(query)
        # --- Extracting the top-N results
        (
            scores,
            results,
        ) = self.corpus_dataset_with_faiss_index.get_nearest_examples(
            self.embeddings_colname,
            query_embedding,
            k=top_n,
        )
        # --- Sorting from highest to lowest
        # NOTE: We need to deconstruct the 'results' to be able to organize
        # the results
        parsed_results = pd.DataFrame.from_dict(
            data=results,
            orient="columns",
        )
        parsed_results["relevance"] = scores
        # Sorting in descending order
        # NOTE(review): the index built in this class does not pass a
        # 'metric_type', so the scores are presumably L2 distances, for
        # which a SMALLER value means a closer match. Confirm that the
        # descending order is the intended one.
        parsed_results = parsed_results.sort_values(
            by=["relevance"],
            ascending=False,
        ).reset_index(drop=True)
        # Casting data type for the 'relevance'
        # The column is replaced as a whole (instead of '.loc[:, ...]') so
        # that the float column can become a string column without pandas
        # attempting an in-place write of an incompatible dtype.
        parsed_results["relevance"] = parsed_results["relevance"].apply(
            lambda x: str(np.round(x, 5))
        )
        # Only keeping certain columns; the results are expected to
        # provide the '_id', 'title' and 'content' fields.
        columns_to_keep = ["_id", "title", "relevance", "content"]

        return parsed_results[columns_to_keep].to_dict(orient="index")
|