Upload 7 files
- README.md +3 -8
- __init__.py +1 -0
- app.py +60 -0
- model.py +80 -0
- rag.configs.yml +7 -0
- requirements.txt +178 -0
- search.py +148 -0
README.md
CHANGED
@@ -1,13 +1,8 @@
 ---
-title: NBPlatina
-emoji: 🐢
-colorFrom: indigo
-colorTo: red
 sdk: streamlit
-sdk_version: 1.
+sdk_version: 1.33.0
 app_file: app.py
-
-license: mit
+license: mit
 ---
 
-
+Run nltk.download("punkt") once before starting the app.
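The note above matters because search.py chunks pages with nltk.tokenize.sent_tokenize, which needs the punkt data on disk. A one-time setup sketch:

import nltk

nltk.download("punkt")  # fetches the sentence-tokenizer data used by search.py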
__init__.py
ADDED
import search
app.py
ADDED
from search import SemanticSearch, GoogleSearch, Document
import streamlit as st
from model import RAGModel, load_configs


def run_on_start():
    # Load the configs and the generation model once per session.
    if "configs" not in st.session_state:
        st.session_state.configs = load_configs(config_file="rag.configs.yml")
    if "model" not in st.session_state:
        st.session_state.model = RAGModel(st.session_state.configs)


run_on_start()


def search(query):
    # Scrape Google for the query and chunk the result pages into a document.
    g = GoogleSearch(query)
    data = g.all_page_data
    d = Document(data, min_char_len=st.session_state.configs["document"]["min_char_length"])
    st.session_state.doc = d.doc()


st.title("Search Here Instead of Google")

if "messages" not in st.session_state:
    st.session_state.messages = []

if "doc" not in st.session_state:
    st.session_state.doc = None

if "refresh" not in st.session_state:
    st.session_state.refresh = True

# Replay the chat history on every Streamlit rerun.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])


if prompt := st.chat_input("Search here instead of Google"):
    st.chat_message("user").markdown(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})

    configs = st.session_state.configs
    if st.session_state.refresh:
        # Only scrape once per session; later prompts reuse the cached document.
        st.session_state.refresh = False
        search(prompt)

    s = SemanticSearch(
        st.session_state.doc,
        configs["model"]["embeding_model"],
        configs["model"]["device"],
    )
    topk, u = s.semantic_search(query=prompt, k=32)
    response = st.session_state.model.answer_query(query=prompt, topk_items=topk)
    with st.chat_message("assistant"):
        st.markdown(response)

    st.session_state.messages.append({"role": "assistant", "content": response})
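Since the Space's README sets app_file: app.py, Hugging Face launches the UI automatically; to try it locally, the standard Streamlit entry point applies:

streamlit run app.py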
model.py
ADDED
from search import SemanticSearch, GoogleSearch, Document
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from transformers.utils import is_flash_attn_2_available
import yaml
import torch
import nltk


def load_configs(config_file: str) -> dict:
    with open(config_file, "r") as f:
        configs = yaml.safe_load(f)

    return configs


class RAGModel:
    def __init__(self, configs) -> None:
        self.configs = configs
        self.device = configs["model"]["device"]
        model_url = configs["model"]["genration_model"]
        # quantization_config = BitsAndBytesConfig(
        #     load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
        # )

        self.model = AutoModelForCausalLM.from_pretrained(
            model_url,
            torch_dtype=torch.float16,
            # quantization_config=quantization_config,
            low_cpu_mem_usage=False,
            attn_implementation="sdpa",
        ).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_url)

    def create_prompt(self, query, topk_items: list[str]):
        # Join the retrieved chunks into a single context block.
        context = "\n-".join(topk_items)

        base_prompt = f"""You are an alternative to Google Search. Your job is to answer the user's query in as much detail as possible.
You have access to the internet and other relevant data related to the user's question.
Take time to read the context and the user query, extract the relevant data, and then answer the query.
Make sure your answer is as detailed as possible.
Do not return your thinking process; just return the answer.
Structure the output as a Wikipedia article.
Now use the following context items to answer the user query.
context: {context}
user query: {query}
"""

        dialog_template = [{"role": "user", "content": base_prompt}]

        prompt = self.tokenizer.apply_chat_template(
            conversation=dialog_template, tokenize=False, add_generation_prompt=True
        )
        return prompt

    def answer_query(self, query: str, topk_items: list[str]):
        prompt = self.create_prompt(query, topk_items)
        input_ids = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        output = self.model.generate(**input_ids, temperature=0.7, max_new_tokens=512, do_sample=True)
        # Strip the prompt and Gemma's special tokens from the decoded output.
        text = self.tokenizer.decode(output[0])
        text = text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "")

        return text


if __name__ == "__main__":
    configs = load_configs(config_file="rag.configs.yml")
    query = "The height of burj khalifa is 1000 meters and it was built in 2023. What is the height of burj khalifa"
    # g = GoogleSearch(query)
    # data = g.all_page_data
    # d = Document(data, 512)
    # doc_chunks = d.doc()
    # s = SemanticSearch(doc_chunks, "all-mpnet-base-v2", "mps")
    # topk, u = s.semantic_search(query=query, k=32)
    r = RAGModel(configs)
    output = r.answer_query(query=query, topk_items=[""])
    print(output)
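The commented-out lines above sketch an alternative 4-bit load path. A minimal sketch of enabling it, assuming a CUDA GPU and the bitsandbytes pin from requirements.txt (the quantized loader places the weights on the GPU itself, so the trailing .to(device) call is dropped):

from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Load the 7B weights in 4-bit; matrix multiplies still compute in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-7b-it",
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
)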
rag.configs.yml
ADDED
document:
  min_char_length: 512

model:
  embeding_model: all-mpnet-base-v2
  genration_model: google/gemma-7b-it
  device: cuda
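These keys are read verbatim by app.py and model.py, so the embeding_model / genration_model spellings must stay in sync with the code. A quick sketch of reading the file with the helper from model.py:

from model import load_configs

configs = load_configs("rag.configs.yml")
print(configs["model"]["genration_model"])     # google/gemma-7b-it
print(configs["document"]["min_char_length"])  # 512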
requirements.txt
ADDED
accelerate==0.29.2
albumentations==1.4.3
altair==5.3.0
attrs==23.2.0
beautifulsoup4==4.12.3
bitsandbytes==0.43.1
blinker==1.7.0
cachetools==5.3.3
certifi==2024.2.2
charset-normalizer==2.0.4
click==8.1.7
colorama==0.4.6
contourpy==1.2.1
cycler==0.12.1
filelock==3.13.1
fonttools==4.50.0
fsspec==2024.3.1
gitdb==4.0.11
GitPython==3.1.43
gmpy2==2.1.2
huggingface-hub==0.22.2
idna==3.4
imageio==2.34.0
importlib_resources==6.4.0
Jinja2==3.1.3
joblib==1.3.2
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
kiwisolver==1.4.5
lazy_loader==0.4
markdown-it-py==3.0.0
MarkupSafe==2.1.3
matplotlib==3.8.4
mdurl==0.1.2
mkl-fft==1.3.8
mkl-random==1.2.4
mkl-service==2.4.0
mpmath==1.3.0
networkx==3.1
nltk==3.8.1
numpy==1.26.4
opencv-python-headless==4.9.0.80
packaging==24.0
pandas==2.2.2
pillow==10.2.0
pip==23.3.1
protobuf==4.25.3
psutil==5.9.8
pyarrow==16.0.0
pydeck==0.8.1b0
Pygments==2.17.2
pyparsing==3.1.2
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.1
referencing==0.34.0
regex==2024.4.16
requests==2.31.0
rich==13.7.1
rpds-py==0.18.0
safetensors==0.4.3
scikit-image==0.22.0
scikit-learn==1.4.1.post1
scipy==1.13.0
sentence-transformers==2.7.0
setuptools==68.2.2
six==1.16.0
smmap==5.0.1
soupsieve==2.5
streamlit==1.33.0
sympy==1.12
tenacity==8.2.3
threadpoolctl==3.4.0
tifffile==2024.2.12
tokenizers==0.15.2
toml==0.10.2
toolz==0.12.1
torch==2.2.2
torchaudio==2.2.2
torchvision==0.17.2
tornado==6.4
tqdm==4.66.2
transformers==4.39.3
typing_extensions==4.9.0
tzdata==2024.1
urllib3==2.1.0
watchdog==4.0.0
wheel==0.41.2
zipp==3.18.1
search.py
ADDED
from bs4 import BeautifulSoup
import urllib.parse
import requests
import nltk
import torch
from typing import Union
from sentence_transformers import SentenceTransformer, util
from concurrent.futures import ThreadPoolExecutor, as_completed


class GoogleSearch:
    def __init__(self, query: str) -> None:
        self.query = query
        escaped_query = urllib.parse.quote_plus(query)
        self.URL = f"https://www.google.com/search?q={escaped_query}"

        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3538.102 Safari/537.36"
        }
        self.links = self.get_initial_links()
        self.all_page_data = self.all_pages()

    def clean_urls(self, anchors: list) -> list[str]:
        # Pull target URLs out of Google's redirect hrefs ("...url=http...&...").
        links: list[str] = []
        for a in anchors:
            links.append(
                list(filter(lambda l: l.startswith("url=http"), a["href"].split("&")))
            )

        links = [
            link.split("url=")[-1]
            for sublist in links
            for link in sublist
            if len(link) > 0
        ]

        return links

    def read_url_page(self, url: str) -> str:
        response = requests.get(url, headers=self.headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text(strip=True)

    def get_initial_links(self) -> list[str]:
        """
        Scrape Google for the query with a keyword-based search.
        """
        print("Searching Google...")
        response = requests.get(self.URL, headers=self.headers)
        soup = BeautifulSoup(response.text, "html.parser")
        anchors = soup.find_all("a", href=True)
        return self.clean_urls(anchors)

    def all_pages(self) -> list[tuple[str, str]]:
        # Fetch every result page concurrently; skip pages that raise an HTTP error.
        data: list[tuple[str, str]] = []
        with ThreadPoolExecutor(max_workers=4) as executor:
            future_to_url = {
                executor.submit(self.read_url_page, url): url for url in self.links
            }
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    output = future.result()
                    data.append((url, output))
                except requests.exceptions.HTTPError as e:
                    print(e)

        return data


class Document:

    def __init__(self, data: list[tuple[str, str]], min_char_len: int) -> None:
        """
        data : list[tuple[str, str]]
            URL and page data
        """
        self.data = data
        self.min_char_len = min_char_len

    def make_min_len_chunk(self):
        raise NotImplementedError

    def chunk_page(
        self,
        page_text: str,
    ) -> list[str]:
        # Accumulate sentences until a chunk reaches min_char_len characters.
        min_len_chunks: list[str] = []
        sentence: str = ""
        for sent in nltk.tokenize.sent_tokenize(page_text):
            sentence += sent
            if len(sentence) > self.min_char_len:
                min_len_chunks.append(sentence)
                sentence = ""
        return min_len_chunks

    def doc(self) -> tuple[list[str], list[str]]:
        print("Creating Document...")
        chunked_data: list[str] = []
        urls: list[str] = []
        for url, dataitem in self.data:
            data = self.chunk_page(dataitem)
            chunked_data.append(data)
            urls.append(url)

        chunked_data = [chunk for sublist in chunked_data for chunk in sublist]
        return chunked_data, urls


class SemanticSearch:
    def __init__(
        self, doc_chunks: tuple[list, list], model_path: str, device: str
    ) -> None:

        self.doc_chunks, self.urls = doc_chunks
        self.st = SentenceTransformer(
            model_path,
            device=device,
        )

    def semantic_search(self, query: str, k: int = 10):
        # Embed the query and every chunk, then return the k best-scoring chunks.
        print("Searching Top k in document...")
        query_embeding = self.get_embeding(query)
        doc_embeding = self.get_embeding(self.doc_chunks)
        scores = util.dot_score(a=query_embeding, b=doc_embeding)[0]

        top_k = torch.topk(scores, k=k)[1].cpu().tolist()
        return [self.doc_chunks[i] for i in top_k], self.urls

    def get_embeding(self, text: Union[list[str], str]):
        en = self.st.encode(text)
        return en
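Putting the pieces together, the commented-out block in model.py's __main__ shows the intended retrieval pipeline; a minimal sketch of running it on its own (assumes network access and the punkt data from the README note):

from search import GoogleSearch, Document, SemanticSearch

query = "height of burj khalifa"
g = GoogleSearch(query)                          # scrape Google, fetch result pages
doc_chunks = Document(g.all_page_data, min_char_len=512).doc()  # (chunks, urls)
s = SemanticSearch(doc_chunks, "all-mpnet-base-v2", "cuda")
topk, urls = s.semantic_search(query=query, k=10)
print(topk[0])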