Spaces:

lewispons
/

GrammarGuru

Sleeping

File size: 10,024 Bytes

2d4243e

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim import corpora\n",
    "from gensim.similarities import SparseMatrixSimilarity\n",
    "from gensim.models import TfidfModel\n",
    "import pandas as pd\n",
    "import gensim\n",
    "import pprint\n",
    "from gensim import corpora\n",
    "from gensim.utils import simple_preprocess\n",
    "from gensim.models import TfidfModel\n",
    "from gensim.parsing import strip_tags, strip_numeric, \\\n",
    "    strip_multiple_whitespaces, stem_text, strip_punctuation, \\\n",
    "    remove_stopwords, preprocess_string\n",
    "import re\n",
    "import os\n",
    "from typing import List"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'strip_tags' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[6], line 5\u001b[0m\n\u001b[1;32m      1\u001b[0m transform_to_lower \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mlambda\u001b[39;00m s: s\u001b[38;5;241m.\u001b[39mlower()\n\u001b[1;32m      2\u001b[0m remove_single_char \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mlambda\u001b[39;00m s: re\u001b[38;5;241m.\u001b[39msub(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms+\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;132;01m{1}\u001b[39;00m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms+\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m, s)\n\u001b[1;32m      4\u001b[0m cleaning_filters \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m----> 5\u001b[0m     \u001b[43mstrip_tags\u001b[49m,\n\u001b[1;32m      6\u001b[0m     strip_numeric,\n\u001b[1;32m      7\u001b[0m     strip_punctuation, \n\u001b[1;32m      8\u001b[0m     strip_multiple_whitespaces, \n\u001b[1;32m      9\u001b[0m     transform_to_lower,\n\u001b[1;32m     10\u001b[0m     remove_stopwords,\n\u001b[1;32m     11\u001b[0m     remove_single_char\n\u001b[1;32m     12\u001b[0m ]\n\u001b[1;32m     14\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mgensim_tokenizer\u001b[39m(docs: List[\u001b[38;5;28mstr\u001b[39m]):\n\u001b[1;32m     15\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m     16\u001b[0m \u001b[38;5;124;03m    Tokenizes a list of strings using a series of cleaning filters.\u001b[39;00m\n\u001b[1;32m     17\u001b[0m \n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     22\u001b[0m \u001b[38;5;124;03m        List[List[str]]: A list of tokenized documents, where each document is represented as a list of tokens.\u001b[39;00m\n\u001b[1;32m     23\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'strip_tags' is not defined"
     ]
    }
   ],
   "source": [
    "\n",
    "def transform_to_lower(s: str) -> str:\n",
    "    \"\"\"Return `s` converted to lowercase.\"\"\"\n",
    "    return s.lower()\n",
    "\n",
    "\n",
    "def remove_single_char(s: str) -> str:\n",
    "    \"\"\"\n",
    "    Remove every whitespace-delimited single-character token from `s`.\n",
    "\n",
    "    Only the character itself is deleted; the whitespace around it is kept so\n",
    "    neighbouring words are never glued together. The previous pattern replaced\n",
    "    ' x ' (spaces included) with '' and turned 'hello a world' into 'helloworld'.\n",
    "\n",
    "    Args:\n",
    "        s (str): Text to clean.\n",
    "\n",
    "    Returns:\n",
    "        str: `s` without stand-alone single characters (extra whitespace may remain).\n",
    "    \"\"\"\n",
    "    # Lookarounds match a lone word character at the start, middle or end of the\n",
    "    # text without consuming the surrounding whitespace.\n",
    "    return re.sub(r'(?<!\\S)\\w(?!\\S)', '', s)\n",
    "\n",
    "# Filters passed to preprocess_string by gensim_tokenizer and cleaning_pipe.\n",
    "# Order is significant: lower-casing is listed before remove_stopwords, and\n",
    "# remove_single_char comes last (presumably so stop-word matching sees\n",
    "# lowercase text - confirm against the gensim documentation).\n",
    "cleaning_filters = [\n",
    "    strip_tags,\n",
    "    strip_numeric,\n",
    "    strip_punctuation, \n",
    "    strip_multiple_whitespaces, \n",
    "    transform_to_lower,\n",
    "    remove_stopwords,\n",
    "    remove_single_char\n",
    "]\n",
    "\n",
    "def gensim_tokenizer(docs: List[str]):\n",
    "    \"\"\"\n",
    "    Run every document through `preprocess_string` with `cleaning_filters`.\n",
    "\n",
    "    Args:\n",
    "        docs (List[str]): Raw documents, one string per document.\n",
    "\n",
    "    Returns:\n",
    "        List[List[str]]: One token list per input document, in input order.\n",
    "    \"\"\"\n",
    "    return [preprocess_string(text, cleaning_filters) for text in docs]\n",
    "\n",
    "\n",
    "def cleaning_pipe(document):\n",
    "    \"\"\"\n",
    "    Clean a single document with the shared `cleaning_filters`.\n",
    "\n",
    "    Args:\n",
    "        document (str): Raw text of one document.\n",
    "\n",
    "    Returns:\n",
    "        list: Tokens produced by `preprocess_string` for that document.\n",
    "    \"\"\"\n",
    "    return preprocess_string(document, cleaning_filters)\n",
    "\n",
    "\n",
    "def get_gensim_dictionary(\n",
    "    tokenized_docs: List[List[str]],\n",
    "    dict_name: str = \"corpus\",\n",
    "    save_dict: bool = False,\n",
    "    parent_folder: str = \"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/nlp_dictionaries\",\n",
    "):\n",
    "    \"\"\"\n",
    "    Build a gensim dictionary from a tokenized corpus and optionally save it.\n",
    "\n",
    "    Args:\n",
    "        tokenized_docs (List[List[str]]): Corpus as one token list per document.\n",
    "        dict_name (str): File name (without extension) used when saving.\n",
    "        save_dict (bool): When True, write the dictionary to\n",
    "            `<parent_folder>/<dict_name>.dict`.\n",
    "        parent_folder (str): Directory that receives the saved dictionary.\n",
    "            Defaults to the location this notebook has always used.\n",
    "\n",
    "    Returns:\n",
    "        corpora.Dictionary: The dictionary built from `tokenized_docs`.\n",
    "    \"\"\"\n",
    "    dictionary = corpora.Dictionary(tokenized_docs)\n",
    "    if save_dict:\n",
    "        # Create the target folder first so saving cannot fail on a missing directory.\n",
    "        os.makedirs(parent_folder, exist_ok=True)\n",
    "        dictionary.save(os.path.join(parent_folder, f\"{dict_name}.dict\"))\n",
    "    return dictionary\n",
    "\n",
    "\n",
    "def get_closest_n(index_matrix: SparseMatrixSimilarity, query: str, n: int,\n",
    "                  dictionary=None, tfidf_model=None):\n",
    "    '''\n",
    "    Retrieves the top matching documents as per cosine similarity\n",
    "    between the TF-IDF vector of the query and all documents.\n",
    "\n",
    "    Args:\n",
    "        index_matrix (SparseMatrixSimilarity): Similarity index to query.\n",
    "        query (str): The query string to find matching documents.\n",
    "        n (int): The number of closest documents to retrieve.\n",
    "        dictionary (corpora.Dictionary, optional): Dictionary used to turn the\n",
    "            query tokens into a bag-of-words vector. Falls back to the\n",
    "            notebook-level `dict_corpus` when omitted.\n",
    "        tfidf_model (TfidfModel, optional): Model used to weight the query\n",
    "            vector. Falls back to the notebook-level `td_idf_model` when omitted.\n",
    "\n",
    "    Returns:\n",
    "        numpy.ndarray: An array of indices representing the top matching\n",
    "        documents, most similar first (empty when n <= 0).\n",
    "    '''\n",
    "    # Resolve the notebook-level objects only when the caller did not pass them.\n",
    "    if dictionary is None:\n",
    "        dictionary = dict_corpus\n",
    "    if tfidf_model is None:\n",
    "        tfidf_model = td_idf_model\n",
    "\n",
    "    # Clean the query document using cleaning_pipe function\n",
    "    query_document = cleaning_pipe(query)\n",
    "\n",
    "    # Convert the query document to bag-of-words representation\n",
    "    query_bow = dictionary.doc2bow(query_document)\n",
    "\n",
    "    # Weight the query with the TF-IDF model before querying the index.\n",
    "    # (Indexing the matrix with its own output, as before, is not a similarity query.)\n",
    "    sims = index_matrix[tfidf_model[query_bow]]\n",
    "\n",
    "    # Sort descending and keep the first n; unlike [-n:], this yields nothing for n == 0.\n",
    "    top_idx = sims.argsort()[::-1][:max(n, 0)]\n",
    "\n",
    "    return top_idx\n",
    "\n",
    "\n",
    "def get_recomendations_metadata(query: str, df: pd.DataFrame, n: int,\n",
    "                                index_matrix=None, dictionary=None, tfidf_model=None):\n",
    "    '''\n",
    "    Retrieves metadata recommendations based on a query using cosine similarity.\n",
    "\n",
    "    Args:\n",
    "        query (str): The query string for which recommendations are sought.\n",
    "        df (pd.DataFrame): The DataFrame containing metadata information.\n",
    "            NOTE(review): rows are selected by position, so this assumes the\n",
    "            frame is in the same order as the documents in the index - confirm.\n",
    "        n (int): The number of recommendations to retrieve.\n",
    "        index_matrix (SparseMatrixSimilarity, optional): Similarity index to\n",
    "            query. Falls back to the notebook-level `similarities` when omitted.\n",
    "        dictionary (corpora.Dictionary, optional): Forwarded to get_closest_n.\n",
    "        tfidf_model (TfidfModel, optional): Forwarded to get_closest_n.\n",
    "\n",
    "    Returns:\n",
    "        pd.DataFrame: A DataFrame containing the recommended metadata, reset with a new index.\n",
    "    '''\n",
    "    if index_matrix is None:\n",
    "        index_matrix = similarities\n",
    "\n",
    "    # Get the indices of the closest matching documents based on the query\n",
    "    recommendations_idxs = get_closest_n(\n",
    "        index_matrix, query, n, dictionary=dictionary, tfidf_model=tfidf_model\n",
    "    )\n",
    "\n",
    "    # Retrieve the recommended metadata rows from the DataFrame based on the indices\n",
    "    recommendations_metadata = df.iloc[recommendations_idxs]\n",
    "\n",
    "    # Reset the index of the recommended metadata DataFrame\n",
    "    recommendations_metadata = recommendations_metadata.reset_index(drop=True)\n",
    "\n",
    "    return recommendations_metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dictionary.load needs the path of the saved dictionary; calling it without one raises TypeError.\n",
    "corpus = corpora.Dictionary.load(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/dictionaries/LanguageLiberator.dict\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the previously saved TF-IDF model from disk.\n",
    "td_idf_model = TfidfModel.load(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/tfidf/SemanticSherlock.model\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the previously saved sparse similarity index from disk.\n",
    "similarities = SparseMatrixSimilarity.load(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/similarities_matrix/LanguageLiberatorSimilarities/LanguageLiberator\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Free-text query to look up. The original `args.query` came from a command-line\n",
    "# script and `args` does not exist in this notebook; edit the string to try other searches.\n",
    "query = \"neural networks for natural language processing\"\n",
    "\n",
    "# Paper metadata that recommendations are drawn from.\n",
    "df = pd.read_parquet(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip\")\n",
    "\n",
    "# Only `corpora` is imported, so the dictionary class is reached through it.\n",
    "dict_corpus = corpora.Dictionary.load(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/dictionaries/LanguageLiberator.dict\")\n",
    "\n",
    "td_idf_model = TfidfModel.load(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/tfidf/SemanticSherlock.model\")\n",
    "\n",
    "similarities = SparseMatrixSimilarity.load(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/similarities_matrix/LanguageLiberatorSimilarities/LanguageLiberator\")\n",
    "\n",
    "results_df = get_recomendations_metadata(query=query, df=df, n=3)\n",
    "# A bare last expression gives the notebook's rich table rendering instead of plain text.\n",
    "results_df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.11.4 ('arxiv-env': venv)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "aae17c2ae2f38cc6f211be9b71a2aa280701d8462782cbc1f67caa83a1603363"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}