rolwinpinto committed on
Commit
e53d8c9
·
verified ·
1 Parent(s): 3ba3681

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -5
app.py CHANGED
@@ -3,7 +3,8 @@ import streamlit as st
3
  import PyPDF2
4
  import matplotlib.pyplot as plt
5
  from io import BytesIO
6
- from sentence_transformers import SentenceTransformer
 
7
  from sklearn.metrics.pairwise import cosine_similarity
8
  import numpy as np
9
  import dotenv
@@ -17,8 +18,8 @@ dotenv.load_dotenv()
17
  API_URL = "https://api-inference.huggingface.co/models/sarvamai/sarvam-2b-v0.5"
18
  headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_KEY')}"}
19
 
20
- # Initialize SentenceTransformer model
21
- embed_model = SentenceTransformer('all-MiniLM-L6-v2')
22
 
23
  def query_huggingface_api(payload):
24
  response = requests.post(API_URL, headers=headers, json=payload)
@@ -102,9 +103,12 @@ def search_similar_sections(document_text, query, top_k=3):
102
  # Split the document into sections (you may need to adjust this based on your document structure)
103
  sections = document_text.split('\n\n')
104
 
 
 
 
105
  # Compute embeddings for the query and all sections
106
- query_embedding = embed_model.encode([query])[0]
107
- section_embeddings = embed_model.encode(sections)
108
 
109
  # Compute cosine similarities
110
  similarities = cosine_similarity([query_embedding], section_embeddings)[0]
 
3
  import PyPDF2
4
  import matplotlib.pyplot as plt
5
  from io import BytesIO
6
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
7
+ from llama_index import Document
8
  from sklearn.metrics.pairwise import cosine_similarity
9
  import numpy as np
10
  import dotenv
 
18
  API_URL = "https://api-inference.huggingface.co/models/sarvamai/sarvam-2b-v0.5"
19
  headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_KEY')}"}
20
 
21
+ # Configure embedding model
22
+ embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
23
 
24
  def query_huggingface_api(payload):
25
  response = requests.post(API_URL, headers=headers, json=payload)
 
103
  # Split the document into sections (you may need to adjust this based on your document structure)
104
  sections = document_text.split('\n\n')
105
 
106
+ # Create Document objects for each section
107
+ documents = [Document(text=section) for section in sections]
108
+
109
  # Compute embeddings for the query and all sections
110
+ query_embedding = embed_model.get_text_embedding(query)
111
+ section_embeddings = [embed_model.get_text_embedding(doc.text) for doc in documents]
112
 
113
  # Compute cosine similarities
114
  similarities = cosine_similarity([query_embedding], section_embeddings)[0]