import pandas as pd
from gensim import corpora
from gensim import similarities
from gensim.models import TfidfModel
from gensim.parsing import strip_tags, strip_numeric, \
    strip_multiple_whitespaces, strip_punctuation, \
    remove_stopwords, preprocess_string
import re

from typing import List
from utils.constants import TEST_INPUTS
import argparse
from random import choice


# Module-level defaults; the CLI arguments parsed in __main__ supply the values actually used.
SAMPLES = 3000
CORPUS_DICTIONARY_PATH = "30Ktokens"
ARXIV_DATASET_PATH = "/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip"
SAVE_DICT = False
QUERY = ""

# Lowercase all text.
transform_to_lower = lambda s: s.lower()
# Drop isolated single-character tokens; replace with a space so neighbouring words stay separated.
remove_single_char = lambda s: re.sub(r'\s+\w{1}\s+', ' ', s)

cleaning_filters = [
    strip_tags,
    strip_numeric,
    strip_punctuation, 
    strip_multiple_whitespaces, 
    transform_to_lower,
    remove_stopwords,
    remove_single_char
]

def gensim_tokenizer(docs: List[str]) -> List[List[str]]:
    """
    Tokenizes a list of strings using a series of cleaning filters.

    Args:
        docs (List[str]): A list of strings to be tokenized.

    Returns:
        List[List[str]]: A list of tokenized documents, where each document is represented as a list of tokens.
    """
    tokenized_docs = list()
    for doc in docs:
        processed_words = preprocess_string(doc, cleaning_filters)
        tokenized_docs.append(processed_words)
    
    return tokenized_docs


def cleaning_pipe(document):
    """
    Applies a series of cleaning steps to a document.

    Args:
        document (str): The document to be cleaned.

    Returns:
        list: A list of processed words after applying the cleaning filters.
    """
    # Invoking gensim.parsing.preprocess_string method with set of filters
    processed_words = preprocess_string(document, cleaning_filters)
    return processed_words


def get_gensim_dictionary(tokenized_docs: List[List[str]], dict_name: str = "corpus", save_dict: bool = False):
    """
    Create a gensim Dictionary from the tokenized corpus and optionally save it to disk.
    """
    dictionary = corpora.Dictionary(tokenized_docs)
    if save_dict:    
        parent_folder = "/Users/luis.morales/Desktop/arxiv-paper-recommender/models/nlp_dictionaries"
        dictionary.save(f'{parent_folder}/{dict_name}.dict')
    return dictionary


def get_closest_n(query: str, n: int):
    '''
    Retrieves the top matching documents as per cosine similarity
    between the TF-IDF vector of the query and all documents.

    Args:
        query (str): The query string to find matching documents.
        n (int): The number of closest documents to retrieve.

    Returns:
        numpy.ndarray: An array of indices representing the top matching documents.
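
    Note:
        Relies on the module-level ``dictionary``, ``tfidf_model`` and ``index``
        objects built in the ``__main__`` block.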
    '''
    # Clean the query document using cleaning_pipe function
    query_document = cleaning_pipe(query)

    # Convert the query document to bag-of-words representation
    query_bow = dictionary.doc2bow(query_document)

    # Calculate similarity scores between the query and all documents using TF-IDF model
    sims = index[tfidf_model[query_bow]]

    # Get the indices of the top n closest documents based on similarity scores
    top_idx = sims.argsort()[-1 * n:][::-1]

    return top_idx


def get_recomendations_metadata(query: str, df: pd.DataFrame, n: int):
    '''
    Retrieves metadata recommendations based on a query using cosine similarity.

    Args:
        query (str): The query string for which recommendations are sought.
        df (pd.DataFrame): The DataFrame containing metadata information.
        n (int): The number of recommendations to retrieve.

    Returns:
        pd.DataFrame: A DataFrame containing the recommended metadata, reset with a new index.
    '''
    # Get the indices of the closest matching documents based on the query
    recommendations_idxs = get_closest_n(query, n)
    
    # Retrieve the recommended metadata rows from the DataFrame based on the indices
    recommendations_metadata = df.iloc[recommendations_idxs]
    
    # Reset the index of the recommended metadata DataFrame
    recommendations_metadata = recommendations_metadata.reset_index(drop=True)
    
    return recommendations_metadata

if __name__ == "__main__":
    """
        Example: 
        python script.py --samples 3000 --corpus_dictionary_path "30Ktokens.dict" --arxiv_datasr_path "/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip" --save_dict --query "your query here"

    """
    # Define and parse command-line arguments
    parser = argparse.ArgumentParser(description='ArXiv Paper Recommender CLI')
    parser.add_argument('--samples', default=30000, type=int, help='Number of samples to consider')
    parser.add_argument('--corpus_dictionary_path', default=None, type=str, help='Path to the corpus dictionary')
    parser.add_argument('--save_dict', action='store_true', help='Flag to save the dictionary')
    parser.add_argument('--arxiv_dataset_path',
                        default="/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip",
                        type=str, help='Path to the ARXIV parquet source')
    parser.add_argument('--query', default=None, type=str, help='User query')
    args = parser.parse_args()

    num_samples = args.samples
    corpus_dictionary_path = args.corpus_dictionary_path
    arxiv_dataset_path = args.arxiv_dataset_path
    save_dict = args.save_dict
    query = args.query

    print("Parameters:")
    print(f"num_samples: {num_samples}, type: {type(num_samples)}")
    print(f"corpus_dictionary_path: {corpus_dictionary_path}, type: {type(corpus_dictionary_path)}")
    print(f"arxiv_dataset_path: {arxiv_dataset_path}, type: {type(arxiv_dataset_path)}")
    print(f"save_dict: {save_dict}, type: {type(save_dict)}")
    print(f"query: {query}, type: {type(query)}")
    

    if num_samples is None:
        df = pd.read_parquet(arxiv_dataset_path)
    else:
        df = pd.read_parquet(arxiv_dataset_path).sample(num_samples).reset_index(drop=True)
    

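    # Pull the pre-cleaned abstracts and tokenize them with the gensim cleaning filters.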
    corpus = df['cleaned_abstracts'].to_list()
    tokenized_corpus = gensim_tokenizer(corpus)

    dictionary = get_gensim_dictionary(
        tokenized_docs=tokenized_corpus,
        dict_name=corpus_dictionary_path,
        save_dict=save_dict
    )

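    # Convert each tokenized document into a sparse bag-of-words vector over the dictionary.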
    BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in tokenized_corpus]

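    # Fit a TF-IDF model on the bag-of-words corpus to re-weight raw term counts.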
    tfidf_model = TfidfModel(BoW_corpus)

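    # Build a sparse similarity index over the TF-IDF vectors; get_closest_n queries it for cosine similarity.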
    index = similarities.SparseMatrixSimilarity(tfidf_model[BoW_corpus], num_features=len(dictionary))

    if query is None:
        query = choice(TEST_INPUTS)

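    # Retrieve the metadata for the top 3 most similar papers.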
    results_df = get_recomendations_metadata(query=query, df=df, n=3)


    print(f"User Request ---- : \n {query}\n")
    for abstract, title in zip(results_df['abstract'].to_list(), results_df['title'].to_list()):
        print(f"Title: {title}")
        print(f"Abstract: {abstract}\n")
        print("--------------------------")