Spaces:

sabazo
/

insurance_advisor_wb

Sleeping

App Files Files Community

sabazo committed on Jul 13, 2024

Commit

f71d212

unverified ·

2 Parent(s): 7a29166 f4bba44

Merge pull request #7 from almutareb/intergrate-database

Browse files

Files changed (7) hide show

app_gui.py +2 -0
config.py +15 -0
rag_app/database/__init__.py +1 -0
rag_app/database/db_handler.py +218 -108
rag_app/database/schema.py +17 -2
rag_app/structured_tools/structured_tools.py +27 -18
rag_app/utils/utils.py +26 -2

app_gui.py CHANGED Viewed

@@ -1,7 +1,9 @@
 # Import Gradio for UI, along with other necessary libraries
 import gradio as gr
 from rag_app.loading_data.load_S3_vector_stores import get_chroma_vs
 from rag_app.agents.react_agent import agent_executor
 get_chroma_vs()

 # Import Gradio for UI, along with other necessary libraries
 import gradio as gr
 from rag_app.loading_data.load_S3_vector_stores import get_chroma_vs
+from rag_app.loading_data.load_S3_vector_stores import get_chroma_vs
 from rag_app.agents.react_agent import agent_executor
+from config import db
 get_chroma_vs()

config.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import os
+from dotenv import load_dotenv
+from rag_app.database.db_handler import DataBaseHandler
+# Load variables from a local .env file before any setting below is read.
+load_dotenv()
+# Settings shared across the app; each is None when its variable is unset.
+SQLITE_FILE_NAME = os.getenv('SOURCES_CACHE')
+PERSIST_DIRECTORY = os.getenv('VECTOR_DATABASE_LOCATION')
+EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
+# Pass the file name explicitly so the handler uses the value read *after*
+# load_dotenv(), regardless of when its own default was evaluated.
+db = DataBaseHandler(sqlite_file_name=SQLITE_FILE_NAME)
+# Create the tables registered on SQLModel.metadata at import time.
+db.create_all_tables()

rag_app/database/__init__.py CHANGED Viewed

	@@ -0,0 +1 @@


1	+ from rag_app.database.db_handler import DataBaseHandler

rag_app/database/db_handler.py CHANGED Viewed

@@ -3,112 +3,222 @@ from rag_app.database.schema import Sources
 from rag_app.utils.logger import get_console_logger
 import os
 from dotenv import load_dotenv
-load_dotenv()
-sqlite_file_name = os.getenv('SOURCES_CACHE')
-sqlite_url = f"sqlite:///{sqlite_file_name}"
-engine = create_engine(sqlite_url, echo=False)
-logger = get_console_logger("db_handler")
-SQLModel.metadata.create_all(engine)
-def read_one(hash_id: dict):
-    with Session(engine) as session:
-        statement = select(Sources).where(Sources.hash_id == hash_id)
-        sources = session.exec(statement).first()
-        return sources
-def add_one(data: dict):
-    with Session(engine) as session:
-        if session.exec(
-            select(Sources).where(Sources.hash_id == data.get("hash_id"))
-        ).first():
-            logger.warning(f"Item with hash_id {data.get('hash_id')} already exists")
-            return None  # or raise an exception, or handle as needed
-        sources = Sources(**data)
-        session.add(sources)
-        session.commit()
-        session.refresh(sources)
-        logger.info(f"Item with hash_id {data.get('hash_id')} added to the database")
-        return sources
-def update_one(hash_id: dict, data: dict):
-    with Session(engine) as session:
-        # Check if the item with the given hash_id exists
-        sources = session.exec(
-            select(Sources).where(Sources.hash_id == hash_id)
-        ).first()
-        if not sources:
-            logger.warning(f"No item with hash_id {hash_id} found for update")
-            return None  # or raise an exception, or handle as needed
-        for key, value in data.items():
-            setattr(sources, key, value)
-        session.commit()
-        logger.info(f"Item with hash_id {hash_id} updated in the database")
-        return sources
-def delete_one(id: int):
-    with Session(engine) as session:
-        # Check if the item with the given hash_id exists
-        sources = session.exec(
-            select(Sources).where(Sources.hash_id == id)
-        ).first()
-        if not sources:
-            logger.warning(f"No item with hash_id {id} found for deletion")
-            return None  # or raise an exception, or handle as needed
-        session.delete(sources)
-        session.commit()
-        logger.info(f"Item with hash_id {id} deleted from the database")
-def add_many(data: list):
-    with Session(engine) as session:
-        for info in data:
-            # Reuse add_one function for each item
-            result = add_one(info)
-            if result is None:
-                logger.warning(
-                    f"Item with hash_id {info.get('hash_id')} could not be added"
-                )
-            else:
-                logger.info(
-                    f"Item with hash_id {info.get('hash_id')} added to the database"
                 )
-        session.commit()  # Commit at the end of the loop
-def delete_many(ids: list):
-    with Session(engine) as session:
-        for id in ids:
-            # Reuse delete_one function for each item
-            result = delete_one(id)
-            if result is None:
-                logger.warning(f"No item with hash_id {id} found for deletion")
-            else:
-                logger.info(f"Item with hash_id {id} deleted from the database")
-        session.commit()  # Commit at the end of the loop
-def read_all(query: dict = None):
-    with Session(engine) as session:
-        statement = select(Sources)
-        if query:
-            statement = statement.where(
-                *[getattr(Sources, key) == value for key, value in query.items()]
-            )
-        sources = session.exec(statement).all()
-        return sources
-def delete_all():
-    with Session(engine) as session:
-        session.exec(Sources).delete()
-        session.commit()
-        logger.info("All items deleted from the database")

 from rag_app.utils.logger import get_console_logger
 import os
 from dotenv import load_dotenv
+import uuid
+from datetime import datetime
+class DataBaseHandler():
+    """
+    Manage the ``Sources`` table of a SQLite database through SQLModel.
+    Every data method opens its own short-lived ``Session`` on ``self.engine``.
+    Attributes:
+        sqlite_file_name (str): The SQLite file name for the database.
+        logger (Logger): The logger for logging database operations.
+        engine (Engine): The SQLAlchemy engine for the database.
+        session_id (str): UUID4 string identifying the current session.
+        session_date_time (str): Creation time of the current session,
+            formatted as ``%Y-%m-%d %H:%M:%S``.
+    Methods:
+        create_all_tables: Create all tables in the database.
+        create_new_session: Generate a fresh session_id and session_date_time.
+        read_one: Read a single entry from the database by its hash_id.
+        add_one: Add a single entry to the database.
+        update_one: Update a single entry in the database by its hash_id.
+        delete_one: Delete a single entry from the database by its hash_id.
+        add_many: Add multiple entries to the database.
+        delete_many: Delete multiple entries from the database by their hash_ids.
+        read_all: Read all entries from the database, optionally filtered by a query.
+        delete_all: Delete all entries from the database.
+    """
+    def __init__(
+        self,
+        sqlite_file_name = None,
+        logger = get_console_logger("db_handler"),
+        ):
+        """
+        Build the engine and start a first session.
+        Args:
+            sqlite_file_name (str, optional): SQLite file to open. When omitted,
+                the SOURCES_CACHE environment variable is read at call time.
+            logger (Logger, optional): Logger used for all database messages.
+        """
+        # A default written in the signature is evaluated once, when the module
+        # is imported, which can be before load_dotenv() has run. Resolve the
+        # environment variable here instead so late-loaded settings are seen.
+        if sqlite_file_name is None:
+            sqlite_file_name = os.getenv('SOURCES_CACHE')
+        self.sqlite_file_name = sqlite_file_name
+        self.logger = logger
+        if self.sqlite_file_name is None:
+            # Behaviour is unchanged (the URL below still gets built), but make
+            # the misconfiguration visible instead of silent.
+            self.logger.warning(
+                "No SQLite file name given and SOURCES_CACHE is not set"
+            )
+        sqlite_url = f"sqlite:///{self.sqlite_file_name}"
+        self.engine = create_engine(sqlite_url, echo=False)
+        self.create_new_session()
+    def create_all_tables(self) -> None:
+        """Create every table registered on ``SQLModel.metadata`` on this engine."""
+        SQLModel.metadata.create_all(self.engine)
+    def create_new_session(self) -> None:
+        """Generate a new ``session_id`` (UUID4) and ``session_date_time``."""
+        self.session_id = str(uuid.uuid4())
+        self.session_date_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+    def read_one(self,hash_id: str):
+        """
+        Read a single entry from the database by its hash_id.
+        Args:
+            hash_id (str): Value compared against ``Sources.hash_id``.
+        Returns:
+            Sources: The first matching entry, or None if no match is found.
+        """
+        with Session(self.engine) as session:
+            statement = select(Sources).where(Sources.hash_id == hash_id)
+            return session.exec(statement).first()
+    def add_one(self,data: dict):
+        """
+        Add a single entry to the database.
+        Args:
+            data (dict): Field values for the new entry; its "hash_id" key is
+                used for the duplicate check.
+        Returns:
+            Sources: The added entry, or None if an entry with the same
+                hash_id already exists.
+        """
+        hash_id = data.get("hash_id")
+        with Session(self.engine) as session:
+            duplicate = session.exec(
+                select(Sources).where(Sources.hash_id == hash_id)
+            ).first()
+            if duplicate:
+                self.logger.warning(f"Item with hash_id {hash_id} already exists")
+                return None
+            sources = Sources(**data)
+            session.add(sources)
+            session.commit()
+            # Reload the row so the returned object stays readable after the
+            # session is closed.
+            session.refresh(sources)
+            self.logger.info(f"Item with hash_id {hash_id} added to the database")
+            return sources
+    def update_one(self,hash_id: str, data: dict):
+        """
+        Update a single entry in the database by its hash_id.
+        Args:
+            hash_id (str): Value compared against ``Sources.hash_id``.
+            data (dict): Attribute names mapped to their new values.
+        Returns:
+            Sources: The updated entry, or None if no match is found.
+        """
+        with Session(self.engine) as session:
+            sources = session.exec(
+                select(Sources).where(Sources.hash_id == hash_id)
+            ).first()
+            if not sources:
+                self.logger.warning(f"No item with hash_id {hash_id} found for update")
+                return None
+            for key, value in data.items():
+                setattr(sources, key, value)
+            session.commit()
+            # Same as add_one: commit expires the instance, so reload it before
+            # handing it to a caller that will read it outside the session.
+            session.refresh(sources)
+            self.logger.info(f"Item with hash_id {hash_id} updated in the database")
+            return sources
+    def delete_one(self,id: str):
+        """
+        Delete a single entry from the database.
+        Args:
+            id (str): Despite the name, this value is compared against
+                ``Sources.hash_id``, not the integer primary key.
+        Returns:
+            True if an entry was deleted, None if no entry matched.
+        """
+        with Session(self.engine) as session:
+            sources = session.exec(
+                select(Sources).where(Sources.hash_id == id)
+            ).first()
+            if not sources:
+                self.logger.warning(f"No item with hash_id {id} found for deletion")
+                return None
+            session.delete(sources)
+            session.commit()
+            self.logger.info(f"Item with hash_id {id} deleted from the database")
+            # Report success so callers can tell a delete from a miss.
+            return True
+    def add_many(self,data: list):
+        """
+        Add multiple entries to the database.
+        Each item goes through ``add_one``, which commits and logs per item,
+        so no extra session or logging is needed here.
+        Args:
+            data (list): List of dictionaries, each containing the data for a new entry.
+        Returns:
+            None
+        """
+        for info in data:
+            self.add_one(info)
+    def delete_many(self,ids: list):
+        """
+        Delete multiple entries from the database.
+        Each item goes through ``delete_one``, which commits and logs per item.
+        Args:
+            ids (list): hash_id values of the entries to delete.
+        Returns:
+            None
+        """
+        for entry_id in ids:
+            self.delete_one(entry_id)
+    def read_all(self,query: dict = None):
+        """
+        Read all entries from the database, optionally filtered by a query.
+        Args:
+            query (dict, optional): Maps ``Sources`` attribute names to the value
+                each must equal; all conditions are passed to one ``where``.
+                Defaults to None (no filter).
+        Returns:
+            list: List of matching entries from the database.
+        """
+        with Session(self.engine) as session:
+            statement = select(Sources)
+            if query:
+                statement = statement.where(
+                    *[getattr(Sources, key) == value for key, value in query.items()]
                 )
+            return session.exec(statement).all()
+    def delete_all(self,):
+        """
+        Delete all entries from the database.
+        Returns:
+            None
+        """
+        # Local import keeps this block self-contained; the module already
+        # depends on sqlmodel for Session/select.
+        from sqlmodel import delete
+        with Session(self.engine) as session:
+            # Issue a single DELETE statement against the Sources table.
+            session.exec(delete(Sources))
+            session.commit()
+            self.logger.info("All items deleted from the database")

rag_app/database/schema.py CHANGED Viewed

@@ -1,9 +1,22 @@
 from sqlmodel import SQLModel, Field
 from typing import Optional
 import datetime
 class Sources(SQLModel, table=True):
     id: Optional[int] = Field(default=None, primary_key=True)
     url: str = Field()
     title: Optional[str] = Field(default="NA", unique=False)
@@ -11,5 +24,7 @@ class Sources(SQLModel, table=True):
     created_at: float = Field(default=datetime.datetime.now().timestamp())
     summary: str = Field(default="")
     embedded: bool = Field(default=False)
-    __table_args__ = {"extend_existing": True}

 from sqlmodel import SQLModel, Field
 from typing import Optional
 import datetime
 class Sources(SQLModel, table=True):
+    """
+    Database schema for the Sources table.
+    Attributes:
+        id (Optional[int]): The primary key for the table.
+        url (str): The URL of the source.
+        title (Optional[str]): The title of the source.
+        hash_id (str): A unique identifier for the source.
+        created_at (float): Timestamp indicating when the entry was created.
+        summary (str): A summary of the source content.
+        embedded (bool): Flag indicating whether the source is embedded.
+        session_id (str): A unique identifier for the session when the entry was added.
+        session_date_time (str): The timestamp when the session was created.
+    """
     id: Optional[int] = Field(default=None, primary_key=True)
     url: str = Field()
     title: Optional[str] = Field(default="NA", unique=False)
     created_at: float = Field(default=datetime.datetime.now().timestamp())
     summary: str = Field(default="")
     embedded: bool = Field(default=False)
+    session_id: str = Field(default="")
+    session_date_time: str = Field(default="")
+    __table_args__ = {"extend_existing": True}

rag_app/structured_tools/structured_tools.py CHANGED Viewed

@@ -1,7 +1,4 @@
-from langchain.tools import BaseTool, StructuredTool, tool
-from langchain_community.tools import WikipediaQueryRun
-from langchain_community.utilities import WikipediaAPIWrapper
-#from langchain.tools import Tool
 from langchain_google_community import GoogleSearchAPIWrapper
 from langchain_community.embeddings.sentence_transformer import (
     SentenceTransformerEmbeddings,
@@ -14,16 +11,11 @@ import chromadb
 from rag_app.utils.utils import (
     parse_list_to_dicts, format_search_results
 )
-from rag_app.database.db_handler import (
-    add_many
-)
 import os
-# from innovation_pathfinder_ai.utils import create_wikipedia_urls_from_text
-persist_directory = os.getenv('VECTOR_DATABASE_LOCATION')
-embedding_model = os.getenv("EMBEDDING_MODEL")
-if not os.path.exists(persist_directory):
     get_chroma_vs()
 @tool
@@ -32,14 +24,14 @@ def memory_search(query:str) -> str:
         This is your primary source to start your search with checking what you already have learned from the past, before going online."""
     # Since we have more than one collections we should change the name of this tool
     client = chromadb.PersistentClient(
-     path=persist_directory,
     )
     collection_name = os.getenv('CONVERSATION_COLLECTION_NAME')
     #store using envar
     embedding_function = SentenceTransformerEmbeddings(
-        model_name=embedding_model,
         )
     vector_db = Chroma(
@@ -51,8 +43,14 @@ def memory_search(query:str) -> str:
     retriever = vector_db.as_retriever()
     docs = retriever.invoke(query)
     return docs.__str__()
 @tool
 def knowledgeBase_search(query:str) -> str:
     """Suche die interne Datenbank nach passenden Versicherungsprodukten und Informationen zu den Versicherungen"""
@@ -65,7 +63,7 @@ def knowledgeBase_search(query:str) -> str:
     #store using envar
     embedding_function = SentenceTransformerEmbeddings(
-        model_name=embedding_model
         )
     # vector_db = Chroma(
@@ -73,16 +71,22 @@ def knowledgeBase_search(query:str) -> str:
     # #collection_name=collection_name,
     # embedding_function=embedding_function,
     # )
-    vector_db = Chroma(persist_directory=persist_directory, embedding_function=embedding_function)
     retriever = vector_db.as_retriever(search_type="mmr", search_kwargs={'k':5, 'fetch_k':10})
     # This is deprecated, changed to invoke
     # LangChainDeprecationWarning: The method `BaseRetriever.get_relevant_documents` was deprecated in langchain-core 0.1.46 and will be removed in 0.3.0. Use invoke instead.
     docs = retriever.invoke(query)
     for doc in docs:
         print(doc)
     return docs.__str__()
 @tool
 def google_search(query: str) -> str:
     """Verbessere die Ergebnisse durch eine Suche über die Webseite der Versicherung. Erstelle eine neue Suchanfrage, um die Erfolgschancen zu verbesseren."""
@@ -91,10 +95,15 @@ def google_search(query: str) -> str:
     search_results:dict = websearch.results(query, 3)
     print(search_results)
     if len(search_results)>1:
         cleaner_sources =format_search_results(search_results)
         parsed_csources = parse_list_to_dicts(cleaner_sources)
-        add_many(parsed_csources)
     else:
         cleaner_sources = search_results
-    return cleaner_sources.__str__()

+from langchain.tools import tool
 from langchain_google_community import GoogleSearchAPIWrapper
 from langchain_community.embeddings.sentence_transformer import (
     SentenceTransformerEmbeddings,
 from rag_app.utils.utils import (
     parse_list_to_dicts, format_search_results
 )
+import chromadb
 import os
+from config import db, PERSIST_DIRECTORY, EMBEDDING_MODEL
+if not os.path.exists(PERSIST_DIRECTORY):
     get_chroma_vs()
 @tool
         This is your primary source to start your search with checking what you already have learned from the past, before going online."""
     # Since we have more than one collections we should change the name of this tool
     client = chromadb.PersistentClient(
+     path=PERSIST_DIRECTORY,
     )
     collection_name = os.getenv('CONVERSATION_COLLECTION_NAME')
     #store using envar
     embedding_function = SentenceTransformerEmbeddings(
+        model_name=EMBEDDING_MODEL,
         )
     vector_db = Chroma(
     retriever = vector_db.as_retriever()
     docs = retriever.invoke(query)
+    # add the session id to each element in `docs`
+    [i.update({"session_id":db.session_id}) for i in docs]
+    db.add_many(docs)
     return docs.__str__()
 @tool
 def knowledgeBase_search(query:str) -> str:
     """Suche die interne Datenbank nach passenden Versicherungsprodukten und Informationen zu den Versicherungen"""
     #store using envar
     embedding_function = SentenceTransformerEmbeddings(
+        model_name=EMBEDDING_MODEL
         )
     # vector_db = Chroma(
     # #collection_name=collection_name,
     # embedding_function=embedding_function,
     # )
+    vector_db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embedding_function)
     retriever = vector_db.as_retriever(search_type="mmr", search_kwargs={'k':5, 'fetch_k':10})
     # This is deprecated, changed to invoke
     # LangChainDeprecationWarning: The method `BaseRetriever.get_relevant_documents` was deprecated in langchain-core 0.1.46 and will be removed in 0.3.0. Use invoke instead.
     docs = retriever.invoke(query)
+    # add the session id to each element in `docs`
+    [i.update({"session_id":db.session_id}) for i in docs]
+    db.add_many(docs)
     for doc in docs:
         print(doc)
     return docs.__str__()
 @tool
 def google_search(query: str) -> str:
     """Verbessere die Ergebnisse durch eine Suche über die Webseite der Versicherung. Erstelle eine neue Suchanfrage, um die Erfolgschancen zu verbesseren."""
     search_results:dict = websearch.results(query, 3)
     print(search_results)
     if len(search_results)>1:
+        # add session id
         cleaner_sources =format_search_results(search_results)
         parsed_csources = parse_list_to_dicts(cleaner_sources)
+        # add the session id to each element in `parsed_csources`
+        [i.update({"session_id":db.session_id}) for i in parsed_csources]
+        db.add_many(parsed_csources)
     else:
         cleaner_sources = search_results
+    return cleaner_sources.__str__()

rag_app/utils/utils.py CHANGED Viewed

@@ -2,7 +2,8 @@ import hashlib
 import datetime
 import os
 import uuid
 # from rag_app.utils import logger
 # logger = logger.get_console_logger("utils")
@@ -112,4 +113,27 @@ def generate_uuid() -> str:
     Returns:
         str: A UUID string.
     """
-    return str(uuid.uuid4())

 import datetime
 import os
 import uuid
+from typing import Dict
+import re
 # from rag_app.utils import logger
 # logger = logger.get_console_logger("utils")
     Returns:
         str: A UUID string.
     """
+    return str(uuid.uuid4())
+def extract_responses(text: str) -> Dict[str, str]:
+    """
+    Pull the user turn and the AI turn out of a transcript string.
+    The user part is whatever follows the first "USER: " up to the first
+    space-plus-newline; the AI part is whatever follows the first "AI: " up to
+    the end of the text. Both may span several lines.
+    Args:
+        text (str): The input text containing user and AI responses.
+    Returns:
+        Dict[str, str]: Keys 'USER' and 'AI'; a part that is not found maps to "".
+    """
+    # One pattern per speaker; insertion order fixes the order of the result keys.
+    speaker_patterns = {
+        "USER": r'USER: (.*?) \n',
+        "AI": r'AI: (.*?)$',
+    }
+    responses = {}
+    for speaker, pattern in speaker_patterns.items():
+        # DOTALL lets the captured group run across line breaks.
+        found = re.search(pattern, text, re.DOTALL)
+        responses[speaker] = found.group(1) if found else ""
+    return responses