import pandas as pd
from gensim import corpora
from gensim import similarities
from gensim.models import TfidfModel
from gensim.parsing import (
    strip_tags,
    strip_numeric,
    strip_multiple_whitespaces,
    stem_text,
    strip_punctuation,
    remove_stopwords,
    preprocess_string,
)
import re
from typing import List
from utils.constants import TEST_INPUTS
import argparse
from random import choice
import sys
SAMPLES = 3000
CORPUS_DICTIONARY_PATH = "30Ktokens"
ARXIV_DATASET_PATH = "/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip"
SAVE_DICT = False
QUERY = ""

transform_to_lower = lambda s: s.lower()
# Drop single-character tokens, keeping a space so neighbouring words are not merged together
remove_single_char = lambda s: re.sub(r'\s+\w{1}\s+', ' ', s)
cleaning_filters = [
    strip_tags,
    strip_numeric,
    strip_punctuation,
    strip_multiple_whitespaces,
    transform_to_lower,
    remove_stopwords,
    remove_single_char,
]
def gensim_tokenizer(docs: List[str]) -> List[List[str]]:
    """
    Tokenizes a list of strings using a series of cleaning filters.

    Args:
        docs (List[str]): A list of strings to be tokenized.

    Returns:
        List[List[str]]: A list of tokenized documents, where each document
            is represented as a list of tokens.
    """
    tokenized_docs = list()
    for doc in docs:
        processed_words = preprocess_string(doc, cleaning_filters)
        tokenized_docs.append(processed_words)
    return tokenized_docs
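
# Illustrative usage (hypothetical input; the exact tokens depend on gensim's stopword list):
#     gensim_tokenizer(["<p>Graph neural networks for molecular property prediction</p>"])
#     # -> [['graph', 'neural', 'networks', 'molecular', 'property', 'prediction']]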
def cleaning_pipe(document: str) -> List[str]:
    """
    Applies a series of cleaning steps to a single document.

    Args:
        document (str): The document to be cleaned.

    Returns:
        List[str]: The processed tokens after applying the cleaning filters.
    """
    # Invoke gensim's preprocess_string with the custom filter chain defined above
    processed_words = preprocess_string(document, cleaning_filters)
    return processed_words
def get_gensim_dictionary(tokenized_docs: List[List[str]], dict_name: str = "corpus", save_dict: bool = False):
    """
    Creates a gensim Dictionary from the preprocessed corpus and optionally saves it to disk.
    """
    dictionary = corpora.Dictionary(tokenized_docs)
    if save_dict:
        parent_folder = "/Users/luis.morales/Desktop/arxiv-paper-recommender/models/nlp_dictionaries"
        dictionary.save(f'{parent_folder}/{dict_name}.dict')
    return dictionary
def get_closest_n(query: str, n: int):
    '''
    Retrieves the top n documents by cosine similarity between the TF-IDF
    vector of the query and all documents in the similarity index.

    Relies on the module-level `dictionary`, `tfidf_model`, and `index`
    objects built in the __main__ block below.

    Args:
        query (str): The query string to find matching documents.
        n (int): The number of closest documents to retrieve.

    Returns:
        numpy.ndarray: An array of indices representing the top matching documents.
    '''
    # Clean the query using the same filter chain as the corpus
    query_document = cleaning_pipe(query)
    # Convert the query to its bag-of-words representation
    query_bow = dictionary.doc2bow(query_document)
    # Similarity scores between the TF-IDF query vector and every indexed document
    sims = index[tfidf_model[query_bow]]
    # Indices of the top n documents, highest similarity first
    top_idx = sims.argsort()[-1 * n:][::-1]
    return top_idx
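
# Example of the argsort slice used above (illustrative numbers; numpy is not imported by this script):
#     sims = np.array([0.1, 0.9, 0.3]); n = 2
#     sims.argsort()[-1 * n:][::-1]  # -> array([1, 2]): indices of the two highest scores, best first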
def get_recomendations_metadata(query: str, df: pd.DataFrame, n: int):
    '''
    Retrieves metadata recommendations based on a query using cosine similarity.

    Args:
        query (str): The query string for which recommendations are sought.
        df (pd.DataFrame): The DataFrame containing the papers' metadata.
        n (int): The number of recommendations to retrieve.

    Returns:
        pd.DataFrame: A DataFrame containing the recommended metadata, reset with a new index.
    '''
    # Get the indices of the closest matching documents for the query
    recommendations_idxs = get_closest_n(query, n)
    # Retrieve the corresponding metadata rows from the DataFrame
    recommendations_metadata = df.iloc[recommendations_idxs]
    # Reset the index of the recommended metadata DataFrame
    recommendations_metadata = recommendations_metadata.reset_index(drop=True)
    return recommendations_metadata
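
# Illustrative usage once `df`, `dictionary`, `tfidf_model`, and `index` exist (see the __main__ block);
# the query string here is hypothetical:
#     results = get_recomendations_metadata(query="contrastive learning for image retrieval", df=df, n=3)
#     results[['title', 'abstract']]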
if __name__ == "__main__":
    """
    Example:
        python script.py --samples 3000 --corpus_dictionary_path "30Ktokens" --arxiv_dataset_path "/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip" --save_dict --query "your query here"
    """
    # Define and parse command-line arguments
    parser = argparse.ArgumentParser(description='ArXiv Paper Recommender CLI')
    parser.add_argument('--samples', default=30000, type=int, help='Number of abstracts to sample from the dataset')
    parser.add_argument('--corpus_dictionary_path', default=None, type=str, help='Name under which to save the corpus dictionary')
    parser.add_argument('--save_dict', action='store_true', help='Save the dictionary to disk')
    parser.add_argument('--arxiv_dataset_path',
                        default="/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip",
                        type=str, help='Path to the arXiv parquet source')
    parser.add_argument('--query', default=None, type=str, help='User query')
    args = parser.parse_args()
    num_samples = args.samples
    corpus_dictionary_path = args.corpus_dictionary_path
    arxiv_dataset_path = args.arxiv_dataset_path
    save_dict = args.save_dict
    query = args.query

    print("Parameters:")
    print(f"num_samples: {num_samples}, type: {type(num_samples)}")
    print(f"corpus_dictionary_path: {corpus_dictionary_path}, type: {type(corpus_dictionary_path)}")
    print(f"arxiv_dataset_path: {arxiv_dataset_path}, type: {type(arxiv_dataset_path)}")
    print(f"save_dict: {save_dict}, type: {type(save_dict)}")
    print(f"query: {query}, type: {type(query)}")
    # Load the full dataset when no sample size is given; otherwise draw a random sample
    if num_samples is None:
        df = pd.read_parquet(arxiv_dataset_path)
    else:
        df = pd.read_parquet(arxiv_dataset_path).sample(num_samples).reset_index(drop=True)
    corpus = df['cleaned_abstracts'].to_list()
    tokenized_corpus = gensim_tokenizer(corpus)

    dictionary = get_gensim_dictionary(
        tokenized_docs=tokenized_corpus,
        dict_name=corpus_dictionary_path or "corpus",  # fall back to the default name if none was given
        save_dict=save_dict
    )

    BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in tokenized_corpus]
    tfidf_model = TfidfModel(BoW_corpus)
    index = similarities.SparseMatrixSimilarity(tfidf_model[BoW_corpus], num_features=len(dictionary))
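
    # A quick sanity check one might run here (illustrative; ids and weights depend on the corpus):
    #     print(tfidf_model[BoW_corpus[0]])          # -> [(0, 0.12), (7, 0.48), ...] sparse TF-IDF weights
    #     print(index[tfidf_model[BoW_corpus[0]]])   # -> one cosine-similarity score per indexed abstract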
    # Fall back to a random test query when none is provided
    if query is None:
        query = choice(TEST_INPUTS)

    results_df = get_recomendations_metadata(query=query, df=df, n=3)

    print(f"User Request ---- : \n {query}\n")
    for abstract, title in zip(results_df['abstract'].to_list(), results_df['title'].to_list()):
        print(f"Title: {title}")
        print(f"Abstract: {abstract}\n")
        print("--------------------------")