B committed · Commit 43add07 · verified · Parent: 7aba694

other files

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Infy[[:space:]]financial[[:space:]]report/INFY_2022_2023.pdf filter=lfs diff=lfs merge=lfs -text
+ Infy[[:space:]]financial[[:space:]]report/INFY_2023_2024.pdf filter=lfs diff=lfs merge=lfs -text
Infy financial report/INFY_2022_2023.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:33cd6264b51e3979680d245eb917015058aff9652c3c1d9ee1b46a938272e858
+ size 13894776
Infy financial report/INFY_2023_2024.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0a9bb9e802aff5f09733b8c78c88e9878732ac46e0fb29754c6da87ad47326a
+ size 11441269
README.md CHANGED
@@ -1,12 +1,41 @@
- ---
- title: Streamlit
- emoji:
- colorFrom: purple
- colorTo: purple
- sdk: streamlit
- sdk_version: 1.43.2
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Financial Chatbot for Infosys Financial Reports
+ ------------------------------------------------
+ - A Retrieval-Augmented Generation (RAG) chatbot that answers questions about Infosys financial statements from the last two fiscal years (2022-2024).
+ - The chatbot combines open-source models with hybrid retrieval to produce accurate, concise answers.
+
+ Project Structure
+ ------------------
+ - The project is organized as follows:
+ ```
+ Financial-Chatbot/
+ ├── app.py                   # Streamlit application interface
+ ├── chroma_db/               # Chroma vector database storage
+ ├── Infy financial report/  # Folder containing Infosys financial PDFs
+ │   ├── INFY_2022_2023.pdf
+ │   └── INFY_2023_2024.pdf
+ ├── requirements.txt         # Python dependencies
+ ├── utils.py                 # Core functionality and RAG implementation
+ └── README.md                # This file
+ ```
+
+ Installation
+ --------------
+ Python version: ```Python 3.10.xx```
+
+ Install the dependencies: ```pip install -r requirements.txt```
+
+ Place PDFs:
+ ------------
+ - Ensure the Infosys financial reports (INFY_2022_2023.pdf and INFY_2023_2024.pdf) are in the ```Infy financial report/``` folder.
+
+ Running the Application
+ ------------------------
+ - To start the chatbot, run:
+
+ ```streamlit run app.py --server.enableCORS false```
+
+ - Streamlit will print a local URL (e.g., http://localhost:8501); open it in your browser to interact with the chatbot.
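+
+ Programmatic Use
+ -----------------
+ - The Streamlit UI is the intended interface, but the core pipeline in utils.py can also be called directly. A minimal sketch (note that importing utils builds the indexes and loads the LLM at import time, so the first call is slow):
+
+ ```python
+ from utils import generate_answer
+
+ # Returns (answer_text, confidence); confidence is the cosine similarity
+ # between the query and answer embeddings.
+ answer, confidence = generate_answer("What was Infosys' revenue in FY2023?")
+ print(f"{answer} (confidence: {confidence})")
+ ```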
nltk_data/corpora/stopwords/english ADDED
@@ -0,0 +1,198 @@
+ a
+ about
+ above
+ after
+ again
+ against
+ ain
+ all
+ am
+ an
+ and
+ any
+ are
+ aren
+ aren't
+ as
+ at
+ be
+ because
+ been
+ before
+ being
+ below
+ between
+ both
+ but
+ by
+ can
+ couldn
+ couldn't
+ d
+ did
+ didn
+ didn't
+ do
+ does
+ doesn
+ doesn't
+ doing
+ don
+ don't
+ down
+ during
+ each
+ few
+ for
+ from
+ further
+ had
+ hadn
+ hadn't
+ has
+ hasn
+ hasn't
+ have
+ haven
+ haven't
+ having
+ he
+ he'd
+ he'll
+ her
+ here
+ hers
+ herself
+ he's
+ him
+ himself
+ his
+ how
+ i
+ i'd
+ if
+ i'll
+ i'm
+ in
+ into
+ is
+ isn
+ isn't
+ it
+ it'd
+ it'll
+ it's
+ its
+ itself
+ i've
+ just
+ ll
+ m
+ ma
+ me
+ mightn
+ mightn't
+ more
+ most
+ mustn
+ mustn't
+ my
+ myself
+ needn
+ needn't
+ no
+ nor
+ not
+ now
+ o
+ of
+ off
+ on
+ once
+ only
+ or
+ other
+ our
+ ours
+ ourselves
+ out
+ over
+ own
+ re
+ s
+ same
+ shan
+ shan't
+ she
+ she'd
+ she'll
+ she's
+ should
+ shouldn
+ shouldn't
+ should've
+ so
+ some
+ such
+ t
+ than
+ that
+ that'll
+ the
+ their
+ theirs
+ them
+ themselves
+ then
+ there
+ these
+ they
+ they'd
+ they'll
+ they're
+ they've
+ this
+ those
+ through
+ to
+ too
+ under
+ until
+ up
+ ve
+ very
+ was
+ wasn
+ wasn't
+ we
+ we'd
+ we'll
+ we're
+ were
+ weren
+ weren't
+ we've
+ what
+ when
+ where
+ which
+ while
+ who
+ whom
+ why
+ will
+ with
+ won
+ won't
+ wouldn
+ wouldn't
+ y
+ you
+ you'd
+ you'll
+ your
+ you're
+ yours
+ yourself
+ yourselves
+ you've
requirements.txt ADDED
@@ -0,0 +1,237 @@
+ pysqlite3-binary
+ absl-py==2.1.0
+ accelerate==1.4.0
+ aiohappyeyeballs==2.5.0
+ aiohttp==3.11.13
+ aiosignal==1.3.2
+ altair==5.5.0
+ annotated-types==0.7.0
+ anyio==4.8.0
+ argon2-cffi==23.1.0
+ argon2-cffi-bindings==21.2.0
+ arrow==1.3.0
+ asgiref==3.8.1
+ asttokens==3.0.0
+ astunparse==1.6.3
+ async-lru==2.0.4
+ async-timeout==4.0.3
+ attrs==25.1.0
+ babel==2.17.0
+ backoff==2.2.1
+ bcrypt==4.3.0
+ beautifulsoup4==4.13.3
+ bitsandbytes==0.45.3
+ bleach==6.2.0
+ blinker==1.9.0
+ build==1.2.2.post1
+ cachetools==5.5.2
+ certifi==2025.1.31
+ cffi==1.17.1
+ charset-normalizer==3.4.1
+ chroma-hnswlib==0.7.6
+ chromadb==0.6.3
+ click==8.1.8
+ coloredlogs==15.0.1
+ comm==0.2.2
+ dataclasses-json==0.6.7
+ debugpy==1.8.13
+ decorator==5.2.1
+ defusedxml==0.7.1
+ Deprecated==1.2.18
+ distro==1.9.0
+ durationpy==0.9
+ exceptiongroup==1.2.2
+ executing==2.2.0
+ faiss-cpu==1.10.0
+ fastapi==0.115.11
+ fastjsonschema==2.21.1
+ filelock==3.17.0
+ flatbuffers==25.2.10
+ fqdn==1.5.1
+ frozenlist==1.5.0
+ fsspec==2025.3.0
+ gast==0.6.0
+ gitdb==4.0.12
+ GitPython==3.1.44
+ google-auth==2.38.0
+ google-pasta==0.2.0
+ googleapis-common-protos==1.69.1
+ greenlet==3.1.1
+ grpcio==1.71.0
+ h11==0.14.0
+ h5py==3.13.0
+ httpcore==1.0.7
+ httptools==0.6.4
+ httpx==0.28.1
+ httpx-sse==0.4.0
+ huggingface-hub==0.29.3
+ humanfriendly==10.0
+ idna==3.10
+ importlib_metadata==8.5.0
+ importlib_resources==6.5.2
+ ipykernel==6.29.5
+ ipython==8.34.0
+ ipywidgets==8.1.5
+ isoduration==20.11.0
+ jedi==0.19.2
+ Jinja2==3.1.6
+ joblib==1.4.2
+ json5==0.10.0
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ jupyter-events==0.12.0
+ jupyter-lsp==2.2.5
+ jupyter_client==8.6.3
+ jupyter_core==5.7.2
+ jupyter_server==2.15.0
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.3.5
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.27.3
+ jupyterlab_widgets==3.0.13
+ keras==3.9.0
+ kubernetes==32.0.1
+ langchain==0.3.20
+ langchain-community==0.3.19
+ langchain-core==0.3.43
+ langchain-huggingface==0.1.2
+ langchain-text-splitters==0.3.6
+ langsmith==0.3.13
+ libclang==18.1.1
+ Markdown==3.7
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ marshmallow==3.26.1
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ mistune==3.1.2
+ ml-dtypes==0.4.1
+ mmh3==5.1.0
+ monotonic==1.6
+ mpmath==1.3.0
+ multidict==6.1.0
+ mypy-extensions==1.0.0
+ namex==0.0.8
+ narwhals==1.30.0
+ nbclient==0.10.2
+ nbconvert==7.16.6
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ networkx==3.4.2
+ nltk==3.9.1
+ notebook_shim==0.2.4
+ numpy==2.0.2
+ oauthlib==3.2.2
+ onnxruntime==1.21.0
+ opentelemetry-api==1.30.0
+ opentelemetry-exporter-otlp-proto-common==1.30.0
+ opentelemetry-exporter-otlp-proto-grpc==1.30.0
+ opentelemetry-instrumentation==0.51b0
+ opentelemetry-instrumentation-asgi==0.51b0
+ opentelemetry-instrumentation-fastapi==0.51b0
+ opentelemetry-proto==1.30.0
+ opentelemetry-sdk==1.30.0
+ opentelemetry-semantic-conventions==0.51b0
+ opentelemetry-util-http==0.51b0
+ opt_einsum==3.4.0
+ optree==0.14.1
+ orjson==3.10.15
+ overrides==7.7.0
+ packaging==24.2
+ pandas==2.2.3
+ pandocfilters==1.5.1
+ parso==0.8.4
+ pexpect==4.9.0
+ pillow==11.1.0
+ platformdirs==4.3.6
+ posthog==3.19.1
+ prometheus_client==0.21.1
+ prompt_toolkit==3.0.50
+ propcache==0.3.0
+ protobuf==5.29.3
+ psutil==7.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pyarrow==19.0.1
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.1
+ pycparser==2.22
+ pydantic==2.10.6
+ pydantic-settings==2.8.1
+ pydantic_core==2.27.2
+ pydeck==0.9.1
+ Pygments==2.19.1
+ pypdf==5.3.1
+ PyPika==0.48.9
+ pyproject_hooks==1.2.0
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.0.1
+ python-json-logger==3.3.0
+ pytz==2025.1
+ PyYAML==6.0.2
+ pyzmq==26.2.1
+ rank-bm25==0.2.2
+ referencing==0.36.2
+ regex==2024.11.6
+ requests==2.32.3
+ requests-oauthlib==2.0.0
+ requests-toolbelt==1.0.0
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rich==13.9.4
+ rpds-py==0.23.1
+ rsa==4.9
+ safetensors==0.5.3
+ scikit-learn==1.6.1
+ scipy==1.15.2
+ Send2Trash==1.8.3
+ sentence-transformers==3.4.1
+ shellingham==1.5.4
+ six==1.17.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ soupsieve==2.6
+ SQLAlchemy==2.0.38
+ stack-data==0.6.3
+ starlette==0.46.1
+ streamlit==1.43.1
+ sympy==1.13.1
+ tenacity==9.0.0
+ termcolor==2.5.0
+ terminado==0.18.1
+ tf_keras==2.18.0
+ threadpoolctl==3.5.0
+ tinycss2==1.4.0
+ tokenizers==0.21.0
+ toml==0.10.2
+ tomli==2.2.1
+ torch==2.6.0
+ tornado==6.4.2
+ tqdm==4.67.1
+ traitlets==5.14.3
+ transformers==4.49.0
+ triton==3.2.0
+ typer==0.15.2
+ types-python-dateutil==2.9.0.20241206
+ typing-inspect==0.9.0
+ typing_extensions==4.12.2
+ tzdata==2025.1
+ uri-template==1.3.0
+ urllib3==2.3.0
+ uvicorn==0.34.0
+ uvloop==0.21.0
+ watchdog==6.0.0
+ watchfiles==1.0.4
+ wcwidth==0.2.13
+ webcolors==24.11.1
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ websockets==15.0.1
+ Werkzeug==3.1.3
+ widgetsnbextension==4.0.13
+ wrapt==1.17.2
+ yarl==1.18.3
+ zipp==3.21.0
+ zstandard==0.23.0
utils.py ADDED
@@ -0,0 +1,320 @@
+ # utils.py
+ """
+ Financial Chatbot Utilities
+ Core functionality for the RAG-based financial chatbot
+ """
+
+ import os
+ import re
+ import sys
+ import warnings
+ from collections import deque
+ from typing import Tuple
+
+ # Swap the stdlib sqlite3 module for pysqlite3 before anything imports
+ # chromadb, whose backend needs a newer SQLite than some hosts provide.
+ import pysqlite3
+ sys.modules["sqlite3"] = pysqlite3
+
+ sys.path.append('/mount/src/gen_ai_dev')
+
+ import nltk
+ import torch
+ import streamlit as st
+
+ # LangChain components
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import Chroma
+ from langchain_huggingface import HuggingFaceEmbeddings
+
+ # Models and ML
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ from rank_bm25 import BM25Okapi
+ from sentence_transformers import CrossEncoder
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ # Initialize NLTK stopwords from the bundled ./nltk_data directory
+ # (avoids needing nltk.download('stopwords') at runtime).
+ nltk.data.path.append('./nltk_data')
+ stop_words = set(nltk.corpus.stopwords.words('english'))
+
+ # Configuration
+ DATA_PATH = "./Infy financial report/"
+ DATA_FILES = ["INFY_2022_2023.pdf", "INFY_2023_2024.pdf"]
+ EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+ LLM_MODEL = "HuggingFaceH4/zephyr-7b-beta"  # alternative: "microsoft/phi-2"
+
+ # Environment settings
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ os.environ["CHROMA_DISABLE_TELEMETRY"] = "true"
+
+ # Suppress specific warnings
+ warnings.filterwarnings("ignore", message=".*oneDNN custom operations.*")
+ warnings.filterwarnings("ignore", message=".*cuBLAS factory.*")
+
+
+ # ------------------------------
+ # Load and Chunk Documents
+ # ------------------------------
+ def load_and_chunk_documents():
+     """Load PDF documents and split them into manageable chunks."""
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=500,
+         chunk_overlap=100,
+         separators=["\n\n", "\n", ".", " ", ""]
+     )
+
+     all_chunks = []
+     for file in DATA_FILES:
+         try:
+             loader = PyPDFLoader(os.path.join(DATA_PATH, file))
+             pages = loader.load()
+             all_chunks.extend(text_splitter.split_documents(pages))
+         except Exception as e:
+             print(f"Error loading {file}: {e}")
+
+     return all_chunks
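+ # The 100-character overlap keeps sentences that straddle a chunk boundary
+ # retrievable from either neighbouring chunk, and the separator order makes
+ # the splitter prefer paragraph, then line, then sentence breaks.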
+
+
+ # ------------------------------
+ # Vector Store and Search Setup
+ # ------------------------------
+ text_chunks = load_and_chunk_documents()
+ embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
+
+
+ @st.cache_resource(show_spinner=False)
+ def load_vector_db():
+     """Create (and cache) the Chroma vector store over the document chunks."""
+     # Reuse the module-level chunks and embeddings instead of recomputing them.
+     return Chroma.from_documents(
+         documents=text_chunks,
+         embedding=embeddings,
+         persist_directory="./chroma_db"
+     )
+
+ # Initialize the vector store
+ vector_db = load_vector_db()
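+ # st.cache_resource builds the store once per process; Chroma also writes
+ # the index to disk under ./chroma_db via persist_directory.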
+
+ # BM25 setup
+ bm25_corpus = [chunk.page_content for chunk in text_chunks]
+ bm25_tokenized = [doc.split() for doc in bm25_corpus]
+ bm25 = BM25Okapi(bm25_tokenized)
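+ # BM25 scores chunks by exact term overlap, so it catches identifiers and
+ # exact figures that embedding-based search can miss.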
+
+ # Cross-encoder for re-ranking
+ cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
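+ # The cross-encoder scores each (query, passage) pair jointly; it is more
+ # accurate than the bi-encoder embeddings but slower, so it is applied only
+ # to the small candidate pool assembled in hybrid_retrieval below.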
+
+
+ # ------------------------------
+ # Conversation Memory
+ # ------------------------------
+ class ConversationMemory:
+     """Stores recent conversation context."""
+
+     def __init__(self, max_size=5):
+         # deque(maxlen=...) silently evicts the oldest turn once full
+         self.buffer = deque(maxlen=max_size)
+
+     def add_interaction(self, query: str, response: str) -> None:
+         self.buffer.append((query, response))
+
+     def get_context(self) -> str:
+         return "\n".join(
+             f"Previous Q: {q}\nPrevious A: {r}" for q, r in self.buffer
+         )
+
+
+ memory = ConversationMemory(max_size=3)
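+ # The three most recent Q/A turns are appended to retrieval results as a
+ # "[memory]" block (see hybrid_retrieval), giving follow-ups some continuity.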
+
+
+ # ------------------------------
+ # Hybrid Retrieval System
+ # ------------------------------
+ def hybrid_retrieval(query: str, top_k: int = 5) -> str:
+     try:
+         # Semantic search over the Chroma vector store
+         semantic_results = vector_db.similarity_search(query, k=top_k * 2)
+         print(f"\n\n[For Debug Only] Semantic Results: {semantic_results}")
+
+         # Keyword search via BM25
+         keyword_results = bm25.get_top_n(query.split(), bm25_corpus, n=top_k * 2)
+         print(f"\n\n[For Debug Only] Keyword Results: {keyword_results}\n\n")
+
+         # Combine and deduplicate results
+         combined = []
+         seen = set()
+
+         for doc in semantic_results:
+             content = doc.page_content
+             if content not in seen:
+                 combined.append((content, "semantic"))
+                 seen.add(content)
+
+         for doc in keyword_results:
+             if doc not in seen:
+                 combined.append((doc, "keyword"))
+                 seen.add(doc)
+
+         # Re-rank the merged pool with the cross-encoder
+         pairs = [(query, content) for content, _ in combined]
+         scores = cross_encoder.predict(pairs)
+
+         # Sort by score, best first
+         sorted_results = sorted(
+             zip(combined, scores),
+             key=lambda x: x[1],
+             reverse=True
+         )
+
+         final_results = [
+             f"[{source}] {content}"
+             for (content, source), _ in sorted_results[:top_k]
+         ]
+
+         memory_context = memory.get_context()
+         if memory_context:
+             final_results.append(f"[memory] {memory_context}")
+
+         return "\n\n".join(final_results)
+
+     except Exception as e:
+         print(f"Retrieval error: {e}")
+         return ""
+
+
+ # ------------------------------
+ # Safety Guardrails
+ # ------------------------------
+ class SafetyGuard:
+     """Validates input and filters output."""
+
+     def __init__(self):
+         self.financial_terms = {
+             'revenue', 'profit', 'ebitda', 'balance', 'cash',
+             'income', 'fiscal', 'growth', 'margin', 'expense'
+         }
+         self.blocked_topics = {
+             'politics', 'sports', 'entertainment', 'religion',
+             'medical', 'hypothetical', 'opinion', 'personal'
+         }
+
+     def validate_input(self, query: str) -> Tuple[bool, str]:
+         query_lower = query.lower()
+         if any(topic in query_lower for topic in self.blocked_topics):
+             return False, "I only discuss financial topics."
+         # Optional stricter check: require at least one financial term.
+         # if not any(term in query_lower for term in self.financial_terms):
+         #     return False, "Please ask financial questions."
+         return True, ""
+
+     def filter_output(self, response: str) -> str:
+         # Strip hedging phrases, then truncate to the first two sentences.
+         phrases_to_remove = {
+             "I'm not sure", "I don't know", "maybe",
+             "possibly", "could be", "uncertain", "perhaps"
+         }
+         for phrase in phrases_to_remove:
+             response = response.replace(phrase, "")
+
+         sentences = re.split(r'[.!?]', response)
+         if len(sentences) > 2:
+             response = '. '.join(sentences[:2]) + '.'
+
+         return response.strip()
+
+
+ guard = SafetyGuard()
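+ # Both checks are plain substring matches, so a legitimate query containing
+ # a blocked word (e.g. "personal loan") is rejected; keep that in mind when
+ # extending the topic lists.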
+
+ # ------------------------------
+ # LLM Initialization
+ # ------------------------------
+ try:
+     @st.cache_resource(show_spinner=False)
+     def load_generator():
+         tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
+         if torch.cuda.is_available():
+             # On GPU, load the model 4-bit quantized to fit in memory
+             model = AutoModelForCausalLM.from_pretrained(
+                 LLM_MODEL,
+                 device_map="auto",
+                 torch_dtype=torch.bfloat16,
+                 load_in_4bit=True
+             )
+         else:
+             # CPU fallback: full-precision float32
+             model = AutoModelForCausalLM.from_pretrained(
+                 LLM_MODEL,
+                 device_map="cpu",
+                 torch_dtype=torch.float32
+             )
+         return pipeline(
+             "text-generation",
+             model=model,
+             tokenizer=tokenizer,
+             max_new_tokens=400,
+             do_sample=True,
+             temperature=0.3,
+             top_k=30,
+             top_p=0.9,
+             repetition_penalty=1.2
+         )
+
+     generator = load_generator()
+ except Exception as e:
+     print(f"Error loading model: {e}")
+     raise
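+ # Sampling settings: a low temperature (0.3) with top-k/top-p filtering keeps
+ # generations close to the retrieved context, while repetition_penalty guards
+ # against the loops that near-greedy decoding can produce.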
+
+
+ # ------------------------------
+ # Response Generation
+ # ------------------------------
+ def extract_final_response(full_response: str) -> str:
+     """Keep only the text after the last assistant tag in the generated output."""
+     parts = full_response.split("<|im_start|>assistant")
+     if len(parts) > 1:
+         response = parts[-1].split("<|im_end|>")[0]
+         return re.sub(r'\s+', ' ', response).strip()
+     return full_response
+
+
+ def generate_answer(query: str) -> Tuple[str, float]:
+     try:
+         # Input validation
+         is_valid, msg = guard.validate_input(query)
+         if not is_valid:
+             return msg, 0.0
+
+         # Retrieve context
+         context = hybrid_retrieval(query)
+
+         # Generate response
+         prompt = f"""<|im_start|>system
+ You are a financial analyst. Provide a brief answer using the context.
+ Context: {context}<|im_end|>
+ <|im_start|>user
+ {query}<|im_end|>
+ <|im_start|>assistant
+ Answer:"""
+
+         response = generator(prompt)[0]['generated_text']
+         clean_response = extract_final_response(response)
+         clean_response = guard.filter_output(clean_response)
+
+         # Calculate confidence
+         query_embed = embeddings.embed_query(query)
+         response_embed = embeddings.embed_query(clean_response)
+         confidence = cosine_similarity([query_embed], [response_embed])[0][0]
+
+         # Update memory
+         memory.add_interaction(query, clean_response)
+
+         return clean_response, round(confidence, 2)
+
+     except Exception as e:
+         return f"Error processing request: {e}", 0.0