sabazo commited on
Commit
0fa9cde
·
unverified ·
2 Parent(s): d6f22d3 6e17460

Merge pull request #53 from almutareb/51-add-a-function-to-create-keywords-for-each-chunk

Browse files
config.py CHANGED
@@ -9,6 +9,7 @@ SQLITE_FILE_NAME = os.getenv('SOURCES_CACHE')
9
  PERSIST_DIRECTORY = os.getenv('VECTOR_DATABASE_LOCATION')
10
  EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
11
  SEVEN_B_LLM_MODEL = os.getenv("SEVEN_B_LLM_MODEL")
 
12
 
13
 
14
  db = DataBaseHandler()
 
9
  PERSIST_DIRECTORY = os.getenv('VECTOR_DATABASE_LOCATION')
10
  EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
11
  SEVEN_B_LLM_MODEL = os.getenv("SEVEN_B_LLM_MODEL")
12
+ BERT_MODEL = os.getenv("BERT_MODEL")  # sentence-transformer model name for KeyBERT keyword extraction; None if the env var is unset — TODO confirm deployment always sets it
13
 
14
 
15
  db = DataBaseHandler()
example.env CHANGED
@@ -27,3 +27,4 @@ LLM_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1"
27
  LLM_MODEL_ARGS=
28
 
29
  SEVEN_B_LLM_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
 
 
27
  LLM_MODEL_ARGS=
28
 
29
  SEVEN_B_LLM_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
30
+ BERT_MODEL="paraphrase-multilingual-MiniLM-L12-v2"
rag_app/chains/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
  from rag_app.chains.user_response_sentiment_chain import user_response_sentiment_prompt
2
  from rag_app.chains.generate_document_summary import generate_document_summary_prompt
3
- from rag_app.chains.query_rewritten_chain import query_rewritting_prompt
 
 
1
  from rag_app.chains.user_response_sentiment_chain import user_response_sentiment_prompt
2
  from rag_app.chains.generate_document_summary import generate_document_summary_prompt
3
+ from rag_app.chains.query_rewritten_chain import query_rewritting_prompt
4
+ from rag_app.chains.generate_keywords_chain import generate_keywords_prompt
rag_app/chains/generate_keywords_chain.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from langchain_core.prompts import PromptTemplate


# Prompt used to ask an LLM for keywords that describe a document chunk, so a
# retriever/LLM can later identify the correct chunk. Single template variable:
# {chunk_metadata}.
# Fix: corrected typos in the original instruction text ("indentify",
# "comma seperate values", "a llm") — misspelled instructions make the model's
# task and the expected output format ambiguous.
generate_keywords_template = """
You will be given meta data for a chunk text
=================
{chunk_metadata}
====================

You will be tasked with creating keywords to help an LLM better identify the correct chunk
to use. Please only return the comma-separated values such that it can easily be parsed.


"""

# PromptTemplate infers the single input variable (chunk_metadata) from the
# template string.
generate_keywords_prompt = PromptTemplate.from_template(generate_keywords_template)
rag_app/utils/generate_keywords_keybert.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from typing import Any, Dict, List, Tuple
from keybert import KeyBERT
from config import BERT_MODEL


def extract_keywords_from_doc(
    doc: str,
    model_name: str = BERT_MODEL,
    **kwargs: Any,
) -> List[Tuple[str, float]]:
    """
    ## Summary
    Extract keywords from a document using the KeyBERT model.

    ## Parameters:
    doc (str): The document from which to extract keywords.
    model_name (str): Sentence-transformer model name handed to KeyBERT.
        Defaults to BERT_MODEL, which is read from the environment at import
        time — NOTE(review): it may be None if the BERT_MODEL env var is
        unset; confirm deployment always provides it.
    **kwargs (Any): Additional keyword arguments forwarded verbatim to
        KeyBERT.extract_keywords. Possible keyword arguments include:
        - top_n (int): The number of top keywords to return.
        - keyphrase_ngram_range (Tuple[int, int]): The ngram range for the keyphrases.
        - stop_words (str): The stop words to use.
        - use_maxsum (bool): Whether to use Max Sum Similarity.
        - use_mmr (bool): Whether to use Maximal Marginal Relevance.
        - diversity (float): The diversity parameter for MMR.
        - nr_candidates (int): The number of candidates for Max Sum Similarity.

    ## Returns:
    List[Tuple[str, float]]: A list of tuples containing keywords and their
    corresponding scores.

    ## Example:
    keywords = extract_keywords_from_doc(doc, top_n=10, stop_words='english')
    """
    # Fix: the original annotated **kwargs as Dict[str, Any]; per PEP 484 a
    # **kwargs annotation types each individual keyword argument, so that
    # claimed every kwarg was itself a dict. Arbitrary forwarded kwargs are Any.
    kw_model = KeyBERT(model=model_name)
    keywords = kw_model.extract_keywords(doc, **kwargs)
    return keywords
58
+ if __name__ == "__main__":
59
+
60
+ # Example usage
61
+ doc = """
62
+ Supervised learning is the machine learning task of learning a function that
63
+ maps an input to an output based on example input-output pairs. It infers a
64
+ function from labeled training data consisting of a set of training examples.
65
+ In supervised learning, each example is a pair consisting of an input object
66
+ (typically a vector) and a desired output value (also called the supervisory signal).
67
+ A supervised learning algorithm analyzes the training data and produces an inferred function,
68
+ which can be used for mapping new examples. An optimal scenario will allow for the
69
+ algorithm to correctly determine the class labels for unseen instances. This requires
70
+ the learning algorithm to generalize from the training data to unseen situations in a
71
+ 'reasonable' way (see inductive bias).
72
+ """
73
+
74
+ # Example of passing additional keyword arguments
75
+ keywords = extract_keywords_from_doc(
76
+ doc,
77
+ top_n=10,
78
+ keyphrase_ngram_range=(1, 2),
79
+ stop_words='english',
80
+ use_maxsum=True,
81
+ nr_candidates=20
82
+ )
83
+ print(keywords)
requirements.txt CHANGED
@@ -15,4 +15,5 @@ gradio
15
  boto3
16
  rich
17
  sqlmodel
18
- python-dotenv
 
 
15
  boto3
16
  rich
17
  sqlmodel
18
+ python-dotenv
19
+ keybert