abhi001vj committed
Commit d870af3 · 1 Parent(s): 2c560b7

updated the app

Files changed (1):
  1. app.py (+50 -8)
app.py CHANGED
@@ -18,6 +18,10 @@ import uuid
 from pathlib import Path
 from haystack.pipelines import Pipeline
 from haystack.nodes import TextConverter, PreProcessor, FileTypeClassifier, PDFToTextConverter, DocxToTextConverter
+from sentence_transformers import SentenceTransformer
+import pinecone
+index_name = "qa_demo"
+
 
 
 preprocessor = PreProcessor(
@@ -33,6 +37,17 @@ text_converter = TextConverter()
 pdf_converter = PDFToTextConverter()
 docx_converter = DocxToTextConverter()
 
+# check if the qa_demo index exists
+if index_name not in pinecone.list_indexes():
+    # create the index if it does not exist
+    pinecone.create_index(
+        index_name,
+        dimension=768,
+        metric="cosine"
+    )
+
+# connect to the qa_demo index we created
+index = pinecone.Index(index_name)
 
 FILE_UPLOAD_PATH= "./data/uploads/"
 os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)
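Note: the pinecone-client 2.x calls added in this hunk (pinecone.list_indexes(), pinecone.Index(...)) only work after the client has been initialised, which this diff does not show. A minimal sketch of that setup; the environment value is a placeholder, not taken from this app:

    import pinecone

    # initialise the client before listing or creating indexes;
    # the environment name below is a placeholder, not from this commit
    pinecone.init(
        api_key="<pinecone_apikey>",   # the app reads this from st.secrets
        environment="us-west1-gcp",
    )

    index_name = "qa_demo"
    if index_name not in pinecone.list_indexes():
        # 768 matches the embedding_dim used by the document store below
        pinecone.create_index(index_name, dimension=768, metric="cosine")
    index = pinecone.Index(index_name)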
@@ -40,7 +55,7 @@ os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)
 def create_doc_store():
     document_store = PineconeDocumentStore(
         api_key= st.secrets["pinecone_apikey"],
-        index='qa_demo',
+        index=index_name,
         similarity="cosine",
         embedding_dim=768
     )
@@ -73,14 +88,19 @@ def query(pipe, question, top_k_reader, top_k_retriever):
 
 document_store = create_doc_store()
 # pipe = create_pipe(document_store)
+retriever_model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
 retriever = EmbeddingRetriever(
     document_store=document_store,
-    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
+    embedding_model=retriever_model,
     model_format="sentence_transformers",
 )
+# load the retriever model from the Hugging Face model hub
+sentence_encoder = SentenceTransformer(retriever_model)
+
 reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
 pipe = ExtractiveQAPipeline(reader, retriever)
 
+
 indexing_pipeline_with_classification = Pipeline()
 indexing_pipeline_with_classification.add_node(
     component=file_type_classifier, name="FileTypeClassifier", inputs=["File"]
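For context, the ExtractiveQAPipeline assembled here is queried with the standard Haystack v1 run API; a sketch with an illustrative question and top_k values:

    # ask the pipeline a question; the Retriever/Reader top_k values are examples
    prediction = pipe.run(
        query="What does the uploaded document say about refunds?",
        params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
    )
    # each answer carries the extracted span and a confidence score
    for answer in prediction["answers"]:
        print(answer.answer, answer.score)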
@@ -99,9 +119,6 @@ indexing_pipeline_with_classification.add_node(
     name="Preprocessor",
     inputs=["TextConverter", "PdfConverter", "DocxConverter"],
 )
-indexing_pipeline_with_classification.add_node(
-    component=document_store, name="DocumentStore", inputs=["Preprocessor"]
-)
 
 def set_state_if_absent(key, value):
     if key not in st.session_state:
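Dropping the DocumentStore node means the indexing pipeline no longer writes documents anywhere: run() now ends at the Preprocessor and returns the processed documents, which the upload handler further down embeds and upserts itself. A sketch of the resulting output shape under Haystack v1 (the filename is illustrative):

    # the pipeline output is a dict; "documents" holds haystack Document objects
    result = indexing_pipeline_with_classification.run(file_paths=["sample.txt"])
    for doc in result["documents"]:
        print(doc.id, doc.meta, doc.content[:80])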
@@ -148,6 +165,7 @@ data_files = st.sidebar.file_uploader(
     "upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="hidden"
 )
 ALL_FILES = []
+META_DATA = []
 for data_file in data_files:
     # Upload file
     if data_file:
@@ -156,11 +174,35 @@ for data_file in data_files:
         shutil.copyfileobj(data_file.file, buffer)
         ALL_FILES.append(file_path)
         st.sidebar.write(str(data_file.name) + "    ✅ ")
-        indexing_pipeline_with_classification.run(file_paths=ALL_FILES)
+        META_DATA.append({"filename": data_file.name})
+
 
 if len(ALL_FILES) > 0:
-    document_store.update_embeddings(retriever, update_existing_embeddings=False)
-
+    # document_store.update_embeddings(retriever, update_existing_embeddings=False)
+    docs = indexing_pipeline_with_classification.run(file_paths=ALL_FILES, meta=META_DATA)
+    index_name = "qa_demo"
+    # we will use batches of 64
+    batch_size = 64
+    docs = docs['documents']
+    with st.spinner(
+        "🧠    Performing indexing of uploaded documents... \n "
+    ):
+        for i in range(0, len(docs), batch_size):
+            # find end of batch
+            i_end = min(i + batch_size, len(docs))
+            # extract batch
+            batch = [doc.content for doc in docs[i:i_end]]
+            # generate embeddings for batch with the sentence encoder
+            emb = sentence_encoder.encode(batch).tolist()
+            # get metadata
+            meta = [doc.meta for doc in docs[i:i_end]]
+            # create unique IDs
+            ids = [doc.id for doc in docs[i:i_end]]
+            # add all to upsert list
+            to_upsert = list(zip(ids, emb, meta))
+            # upsert/insert these records to pinecone
+            _ = index.upsert(vectors=to_upsert)
+
 top_k_reader = st.sidebar.slider(
     "Max. number of answers",
     min_value=1,
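To sanity-check what the loop above upserts, the records can be read back with the same pinecone-client 2.x API; a sketch, with a made-up question:

    # embed a query with the same sentence encoder used for indexing
    query_emb = sentence_encoder.encode("What does the document cover?").tolist()
    # fetch the closest stored chunks along with their {"filename": ...} metadata
    result = index.query(vector=query_emb, top_k=3, include_metadata=True)
    for match in result["matches"]:
        print(match["id"], match["score"], match["metadata"])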
 