lfoppiano committed
Commit 4108477 · 1 Parent(s): 135de1f

back to the huggingface API embeddings, remove non-usable embeddings

document_qa/document_qa_engine.py CHANGED
@@ -423,7 +423,7 @@ class DocumentQAEngine:
         if doc_id:
             hash = doc_id
         else:
-            hash = metadata[0]['hash']
+            hash = metadata[0]['hash'] if len(metadata) > 0 and 'hash' in metadata[0] else ""
 
         self.data_storage.embed_document(hash, texts, metadata)
 
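For reference, the guarded lookup added above can be exercised in isolation like this (a minimal sketch; the resolve_hash helper is hypothetical and only mirrors the new branch, it is not part of DocumentQAEngine; the assumed metadata shape is a list of dicts whose first entry may carry the document hash):

# Minimal sketch of the fallback introduced in the hunk above.
def resolve_hash(doc_id, metadata):
    if doc_id:
        return doc_id
    # Fall back to the first metadata entry's hash, or "" when metadata is
    # empty or the key is missing, so no IndexError/KeyError is raised here.
    return metadata[0]['hash'] if len(metadata) > 0 and 'hash' in metadata[0] else ""

print(resolve_hash("forced-id", []))               # forced-id
print(resolve_hash(None, [{'hash': 'abc123'}]))    # abc123
print(resolve_hash(None, [{'title': 'no hash'}]))  # (empty string)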
streamlit_app.py CHANGED
@@ -6,7 +6,7 @@ from tempfile import NamedTemporaryFile
 import dotenv
 from grobid_quantities.quantities import QuantitiesAPI
 from langchain.memory import ConversationBufferMemory
-from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpointEmbeddings
 from langchain_openai import ChatOpenAI
 from streamlit_pdf_viewer import pdf_viewer
 
@@ -23,9 +23,7 @@ API_MODELS = {
 }
 
 API_EMBEDDINGS = {
-    'intfloat/e5-large-v2': 'intfloat/e5-large-v2',
-    'intfloat/multilingual-e5-large-instruct': 'intfloat/multilingual-e5-large-instruct:',
-    'Salesforce/SFR-Embedding-2_R': 'Salesforce/SFR-Embedding-2_R'
+    'intfloat/multilingual-e5-large-instruct': 'intfloat/multilingual-e5-large-instruct'
 }
 
 if 'rqa' not in st.session_state:
@@ -135,8 +133,9 @@ def init_qa(model_name, embeddings_name):
         api_key=os.environ.get('API_KEY')
     )
 
-    embeddings = HuggingFaceEmbeddings(
-        model_name=API_EMBEDDINGS[embeddings_name])
+    embeddings = HuggingFaceEndpointEmbeddings(
+        repo_id=API_EMBEDDINGS[embeddings_name]
+    )
 
     storage = DataStorage(embeddings)
     return DocumentQAEngine(chat, storage, grobid_url=os.environ['GROBID_URL'], memory=st.session_state['memory'])
@@ -320,7 +319,8 @@ if uploaded_file and not st.session_state.loaded_embeddings:
         st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
             tmp_file.name,
             chunk_size=chunk_size,
-            perc_overlap=0.1)
+            perc_overlap=0.1
+        )
     st.session_state['loaded_embeddings'] = True
     st.session_state.messages = []
 
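The functional change in this file is the embeddings backend: HuggingFaceEmbeddings runs the model locally via sentence-transformers, while HuggingFaceEndpointEmbeddings sends the texts to the Hugging Face Inference API for the single repo id kept in API_EMBEDDINGS. A minimal sketch of the new code path outside of Streamlit (the example texts and the explicit embed_* calls are illustrative; the app itself passes the embeddings object to DataStorage instead):

from langchain_huggingface import HuggingFaceEndpointEmbeddings

# Same repo id that remains in API_EMBEDDINGS after this commit; the HF API
# token is assumed to be picked up from the environment by the library's
# default token resolution.
embeddings = HuggingFaceEndpointEmbeddings(repo_id='intfloat/multilingual-e5-large-instruct')

# Standard LangChain Embeddings interface, so DataStorage / DocumentQAEngine
# keep working unchanged.
doc_vectors = embeddings.embed_documents(["a chunk of the uploaded PDF", "another chunk"])
query_vector = embeddings.embed_query("What quantities are measured in this paper?")
print(len(doc_vectors), len(query_vector))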