Spaces:

dataprincess
/

ask-anjibot-anything

Sleeping

App Files Files Community

dataprincess committed on Oct 5, 2024

Commit

f093d4b

verified ·

1 Parent(s): 20768f0

regex

Browse files

Files changed (1) hide show

app.py +31 -36

app.py CHANGED Viewed

@@ -9,8 +9,7 @@ from tqdm.auto import tqdm
 import streamlit as st
 import re
-# Constants (hardcoded)
 FILE_PATH = "anjibot_chunks.json"
 BATCH_SIZE = 384
 INDEX_NAME = "groq-llama-3-rag"
@@ -55,44 +54,42 @@ for i in tqdm(range(0, len(data['id']), BATCH_SIZE)):
     index.upsert(vectors=to_upsert)
 def extract_course_code(text: str) -> "list[str] | None":
     """Find every course code mentioned in ``text``.

     A course code is one of the prefixes ``ged``/``geds``, ``stat``/``stats``,
     ``math``/``maths``, ``cosc``, ``seng`` or ``itgy``, followed by optional
     whitespace and exactly three digits, delimited by word boundaries.
     Matching is case-insensitive.

     Args:
         text: Free text to scan, e.g. a user question.

     Returns:
         The matched substrings in order of appearance, with the casing and
         spacing they have in ``text``, or ``None`` when nothing matches.
     """
     pattern = r'\b(?:geds?|stats?|maths?|cosc|seng|itgy)\s*\d{3}\b'
     found = re.findall(pattern, text, re.IGNORECASE)
     # Keep the historical "None means no code" result rather than an empty
     # list, so callers that test the value see the same thing as before.
     return found or None
-def get_docs(query: str, top_k: int, batch_size: int = 5, threshold: float = 0.66) -> list[str]:
-    queried_course_codes = extract_course_code(query)
-    i = 0
-    relevant_docs = []
-    while True:
         xq = encoder.encode(query)
-        res = index.query(vector=xq.tolist(), top_k=batch_size, include_metadata=True, offset=i)
-        if len(res["matches"]) == 0:
-            break
-        for match in res["matches"]:
-            similarity_score = match['score']
-            content = match["metadata"]['content']
-            if similarity_score >= threshold:
-                if queried_course_codes:
-                    for course_code in queried_course_codes:
-                        if course_code in content:
-                            relevant_docs.append(content)
-                            break
-        if relevant_docs:
-            break
-        i += batch_size
-    if relevant_docs:
-        return relevant_docs
-    else:
-        return ["No exact match found for the course code, even after searching with a higher similarity score."]
 def get_response(query: str, docs: list[str]) -> str:
     system_message = (
@@ -115,8 +112,6 @@ def get_response(query: str, docs: list[str]) -> str:
     )
     return chat_response.choices[0].message.content
 def handle_query(user_query: str):
     # Get relevant documents

 import streamlit as st
 import re
+# Configuration constants
 FILE_PATH = "anjibot_chunks.json"
 BATCH_SIZE = 384
 INDEX_NAME = "groq-llama-3-rag"
     index.upsert(vectors=to_upsert)
 def extract_course_code(text: str) -> "list[str] | None":
     """Find every course code mentioned in ``text``.

     A course code is one of the prefixes ``ged``/``geds``, ``stat``/``stats``,
     ``math``/``maths``, ``cosc``, ``seng`` or ``itgy``, followed by optional
     whitespace and exactly three digits, delimited by word boundaries.
     Matching is case-insensitive.

     Args:
         text: Free text to scan, e.g. a user question.

     Returns:
         The matched substrings in order of appearance, with the casing and
         spacing they have in ``text``, or ``None`` when nothing matches.
     """
     pattern = r'\b(?:geds?|stats?|maths?|cosc|seng|itgy)\s*\d{3}\b'
     found = re.findall(pattern, text, re.IGNORECASE)
     # Keep the historical "None means no code" result rather than an empty
     # list, so callers that test the value see the same thing as before.
     return found or None
def get_docs(query: str, top_k: int) -> list[str]:
    """Return up to ``top_k`` document chunks relevant to ``query``.

    Chunks whose text mentions a course code found in the query come first
    (case-insensitive substring match over ``data['metadata']``). If that
    yields fewer than ``top_k`` chunks, the rest are taken from a vector
    search on ``index`` using the encoded query, skipping any chunk that has
    already been selected so the same passage is never returned twice.

    Args:
        query: The user's question.
        top_k: Maximum number of chunks to return.

    Returns:
        At most ``top_k`` chunk texts, course-code matches first; an empty
        list when ``top_k`` is not positive.
    """
    if top_k <= 0:
        return []

    docs: list[str] = []
    course_codes = extract_course_code(query)
    if course_codes:
        # Compare in lowercase so "COSC 101" in the query matches "cosc 101"
        # in a chunk and vice versa.
        wanted = [code.lower() for code in course_codes]
        docs = [
            x['content'] for x in data['metadata']
            if any(code in x['content'].lower() for code in wanted)
        ]

    if len(docs) < top_k:
        xq = encoder.encode(query)
        # Request a full top_k rather than only the free slots: some of the
        # nearest neighbours may be chunks already picked above, and those
        # are dropped below.
        res = index.query(vector=xq.tolist(), top_k=top_k, include_metadata=True)
        seen = set(docs)
        for match in res["matches"]:
            content = match["metadata"]['content']
            if content not in seen:
                seen.add(content)
                docs.append(content)

    return docs[:top_k]
 def get_response(query: str, docs: list[str]) -> str:
     system_message = (
     )
     return chat_response.choices[0].message.content
 def handle_query(user_query: str):
     # Get relevant documents