sabazo commited on
Commit
0fa9cde
·
unverified ·
2 Parent(s): d6f22d3 6e17460

Merge pull request #53 from almutareb/51-add-a-function-to-create-keywords-for-each-chunk

Browse files
config.py CHANGED
@@ -9,6 +9,7 @@ SQLITE_FILE_NAME = os.getenv('SOURCES_CACHE')
9
  PERSIST_DIRECTORY = os.getenv('VECTOR_DATABASE_LOCATION')
10
  EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
11
  SEVEN_B_LLM_MODEL = os.getenv("SEVEN_B_LLM_MODEL")
 
12
 
13
 
14
  db = DataBaseHandler()
 
9
  PERSIST_DIRECTORY = os.getenv('VECTOR_DATABASE_LOCATION')
10
  EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
11
  SEVEN_B_LLM_MODEL = os.getenv("SEVEN_B_LLM_MODEL")
12
+ BERT_MODEL = os.getenv("BERT_MODEL")  # sentence-transformer model name for KeyBERT keyword extraction; None if the env var is unset — TODO confirm deployment always sets it
13
 
14
 
15
  db = DataBaseHandler()
example.env CHANGED
@@ -27,3 +27,4 @@ LLM_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1"
27
  LLM_MODEL_ARGS=
28
 
29
  SEVEN_B_LLM_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
 
 
27
  LLM_MODEL_ARGS=
28
 
29
  SEVEN_B_LLM_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
30
+ BERT_MODEL="paraphrase-multilingual-MiniLM-L12-v2"
rag_app/chains/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
  from rag_app.chains.user_response_sentiment_chain import user_response_sentiment_prompt
2
  from rag_app.chains.generate_document_summary import generate_document_summary_prompt
3
- from rag_app.chains.query_rewritten_chain import query_rewritting_prompt
 
 
1
  from rag_app.chains.user_response_sentiment_chain import user_response_sentiment_prompt
2
  from rag_app.chains.generate_document_summary import generate_document_summary_prompt
3
+ from rag_app.chains.query_rewritten_chain import query_rewritting_prompt
4
+ from rag_app.chains.generate_keywords_chain import generate_keywords_prompt
rag_app/chains/generate_keywords_chain.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from langchain_core.prompts import PromptTemplate


# Prompt used to ask an LLM for keywords that describe a document chunk, so a
# retriever/LLM can later identify the correct chunk. Single template variable:
# {chunk_metadata}.
# Fix: corrected typos in the original instruction text ("indentify",
# "comma seperate values", "a llm") — misspelled instructions make the model's
# task and the expected output format ambiguous.
generate_keywords_template = """
You will be given meta data for a chunk text
=================
{chunk_metadata}
====================

You will be tasked with creating keywords to help an LLM better identify the correct chunk
to use. Please only return the comma-separated values such that it can easily be parsed.


"""

# PromptTemplate infers the single input variable (chunk_metadata) from the
# template string.
generate_keywords_prompt = PromptTemplate.from_template(generate_keywords_template)
rag_app/utils/generate_keywords_keybert.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from typing import Any, Dict, List, Tuple
from keybert import KeyBERT
from config import BERT_MODEL


def extract_keywords_from_doc(
    doc: str,
    model_name: str = BERT_MODEL,
    **kwargs: Any,
) -> List[Tuple[str, float]]:
    """
    ## Summary
    Extract keywords from a document using the KeyBERT model.

    ## Parameters:
    doc (str): The document from which to extract keywords.
    model_name (str): Sentence-transformer model name handed to KeyBERT.
        Defaults to BERT_MODEL, which is read from the environment at import
        time — NOTE(review): it may be None if the BERT_MODEL env var is
        unset; confirm deployment always provides it.
    **kwargs (Any): Additional keyword arguments forwarded verbatim to
        KeyBERT.extract_keywords. Possible keyword arguments include:
        - top_n (int): The number of top keywords to return.
        - keyphrase_ngram_range (Tuple[int, int]): The ngram range for the keyphrases.
        - stop_words (str): The stop words to use.
        - use_maxsum (bool): Whether to use Max Sum Similarity.
        - use_mmr (bool): Whether to use Maximal Marginal Relevance.
        - diversity (float): The diversity parameter for MMR.
        - nr_candidates (int): The number of candidates for Max Sum Similarity.

    ## Returns:
    List[Tuple[str, float]]: A list of tuples containing keywords and their
    corresponding scores.

    ## Example:
    keywords = extract_keywords_from_doc(doc, top_n=10, stop_words='english')
    """
    # Fix: the original annotated **kwargs as Dict[str, Any]; per PEP 484 a
    # **kwargs annotation types each individual keyword argument, so that
    # claimed every kwarg was itself a dict. Arbitrary forwarded kwargs are Any.
    kw_model = KeyBERT(model=model_name)
    keywords = kw_model.extract_keywords(doc, **kwargs)
    return keywords
58
+ if __name__ == "__main__":
59
+
60
+ # Example usage
61
+ doc = """
62
+ Supervised learning is the machine learning task of learning a function that
63
+ maps an input to an output based on example input-output pairs. It infers a
64
+ function from labeled training data consisting of a set of training examples.
65
+ In supervised learning, each example is a pair consisting of an input object
66
+ (typically a vector) and a desired output value (also called the supervisory signal).
67
+ A supervised learning algorithm analyzes the training data and produces an inferred function,
68
+ which can be used for mapping new examples. An optimal scenario will allow for the
69
+ algorithm to correctly determine the class labels for unseen instances. This requires
70
+ the learning algorithm to generalize from the training data to unseen situations in a
71
+ 'reasonable' way (see inductive bias).
72
+ """
73
+
74
+ # Example of passing additional keyword arguments
75
+ keywords = extract_keywords_from_doc(
76
+ doc,
77
+ top_n=10,
78
+ keyphrase_ngram_range=(1, 2),
79
+ stop_words='english',
80
+ use_maxsum=True,
81
+ nr_candidates=20
82
+ )
83
+ print(keywords)
requirements.txt CHANGED
@@ -15,4 +15,5 @@ gradio
15
  boto3
16
  rich
17
  sqlmodel
18
- python-dotenv
 
 
15
  boto3
16
  rich
17
  sqlmodel
18
+ python-dotenv
19
+ keybert