Spaces:
Runtime error
Runtime error
abhi001vj
committed on
Commit
·
d870af3
1
Parent(s):
2c560b7
updated the app
Browse files
app.py
CHANGED
@@ -18,6 +18,10 @@ import uuid
|
|
18 |
from pathlib import Path
|
19 |
from haystack.pipelines import Pipeline
|
20 |
from haystack.nodes import TextConverter, PreProcessor, FileTypeClassifier, PDFToTextConverter, DocxToTextConverter
|
|
|
|
|
|
|
|
|
21 |
|
22 |
|
23 |
preprocessor = PreProcessor(
|
@@ -33,6 +37,17 @@ text_converter = TextConverter()
|
|
33 |
pdf_converter = PDFToTextConverter()
|
34 |
docx_converter = DocxToTextConverter()
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
FILE_UPLOAD_PATH= "./data/uploads/"
|
38 |
os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)
|
@@ -40,7 +55,7 @@ os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)
|
|
40 |
def create_doc_store():
|
41 |
document_store = PineconeDocumentStore(
|
42 |
api_key= st.secrets["pinecone_apikey"],
|
43 |
-
index=
|
44 |
similarity="cosine",
|
45 |
embedding_dim=768
|
46 |
)
|
@@ -73,14 +88,19 @@ def query(pipe, question, top_k_reader, top_k_retriever):
|
|
73 |
|
74 |
document_store = create_doc_store()
|
75 |
# pipe = create_pipe(document_store)
|
|
|
76 |
retriever = EmbeddingRetriever(
|
77 |
document_store=document_store,
|
78 |
-
embedding_model="
|
79 |
model_format="sentence_transformers",
|
80 |
)
|
|
|
|
|
|
|
81 |
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
|
82 |
pipe = ExtractiveQAPipeline(reader, retriever)
|
83 |
|
|
|
84 |
indexing_pipeline_with_classification = Pipeline()
|
85 |
indexing_pipeline_with_classification.add_node(
|
86 |
component=file_type_classifier, name="FileTypeClassifier", inputs=["File"]
|
@@ -99,9 +119,6 @@ indexing_pipeline_with_classification.add_node(
|
|
99 |
name="Preprocessor",
|
100 |
inputs=["TextConverter", "PdfConverter", "DocxConverter"],
|
101 |
)
|
102 |
-
indexing_pipeline_with_classification.add_node(
|
103 |
-
component=document_store, name="DocumentStore", inputs=["Preprocessor"]
|
104 |
-
)
|
105 |
|
106 |
def set_state_if_absent(key, value):
|
107 |
if key not in st.session_state:
|
@@ -148,6 +165,7 @@ data_files = st.sidebar.file_uploader(
|
|
148 |
"upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="hidden"
|
149 |
)
|
150 |
ALL_FILES = []
|
|
|
151 |
for data_file in data_files:
|
152 |
# Upload file
|
153 |
if data_file:
|
@@ -156,11 +174,35 @@ for data_file in data_files:
|
|
156 |
shutil.copyfileobj(data_file.file, buffer)
|
157 |
ALL_FILES.append(file_path)
|
158 |
st.sidebar.write(str(data_file.name) + " ✅ ")
|
159 |
-
|
|
|
160 |
|
161 |
if len(ALL_FILES) > 0:
|
162 |
-
document_store.update_embeddings(retriever, update_existing_embeddings=False)
|
163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
top_k_reader = st.sidebar.slider(
|
165 |
"Max. number of answers",
|
166 |
min_value=1,
|
|
|
18 |
from pathlib import Path
|
19 |
from haystack.pipelines import Pipeline
|
20 |
from haystack.nodes import TextConverter, PreProcessor, FileTypeClassifier, PDFToTextConverter, DocxToTextConverter
|
21 |
+
from sentence_transformers import SentenceTransformer
|
22 |
+
import pinecone
|
23 |
+
index_name = "qa_demo"
|
24 |
+
|
25 |
|
26 |
|
27 |
preprocessor = PreProcessor(
|
|
|
37 |
pdf_converter = PDFToTextConverter()
|
38 |
docx_converter = DocxToTextConverter()
|
39 |
|
40 |
+
# check whether the Pinecone index named by index_name already exists
|
41 |
+
if index_name not in pinecone.list_indexes():
|
42 |
+
# create the index if it does not exist
|
43 |
+
pinecone.create_index(
|
44 |
+
index_name,
|
45 |
+
dimension=768,
|
46 |
+
metric="cosine"
|
47 |
+
)
|
48 |
+
|
49 |
+
# connect to the index named by index_name (created above if it was missing)
|
50 |
+
index = pinecone.Index(index_name)
|
51 |
|
52 |
FILE_UPLOAD_PATH= "./data/uploads/"
|
53 |
os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)
|
|
|
55 |
def create_doc_store():
|
56 |
document_store = PineconeDocumentStore(
|
57 |
api_key= st.secrets["pinecone_apikey"],
|
58 |
+
index=index_name,
|
59 |
similarity="cosine",
|
60 |
embedding_dim=768
|
61 |
)
|
|
|
88 |
|
89 |
document_store = create_doc_store()
|
90 |
# pipe = create_pipe(document_store)
|
91 |
+
retriever_model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
|
92 |
retriever = EmbeddingRetriever(
|
93 |
document_store=document_store,
|
94 |
+
embedding_model=retriever_model",
|
95 |
model_format="sentence_transformers",
|
96 |
)
|
97 |
+
# load the retriever model from huggingface model hub
|
98 |
+
sentence_encoder = SentenceTransformer(retriever_model)
|
99 |
+
|
100 |
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
|
101 |
pipe = ExtractiveQAPipeline(reader, retriever)
|
102 |
|
103 |
+
|
104 |
indexing_pipeline_with_classification = Pipeline()
|
105 |
indexing_pipeline_with_classification.add_node(
|
106 |
component=file_type_classifier, name="FileTypeClassifier", inputs=["File"]
|
|
|
119 |
name="Preprocessor",
|
120 |
inputs=["TextConverter", "PdfConverter", "DocxConverter"],
|
121 |
)
|
|
|
|
|
|
|
122 |
|
123 |
def set_state_if_absent(key, value):
|
124 |
if key not in st.session_state:
|
|
|
165 |
"upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="hidden"
|
166 |
)
|
167 |
ALL_FILES = []
|
168 |
+
META_DATA = []
|
169 |
for data_file in data_files:
|
170 |
# Upload file
|
171 |
if data_file:
|
|
|
174 |
shutil.copyfileobj(data_file.file, buffer)
|
175 |
ALL_FILES.append(file_path)
|
176 |
st.sidebar.write(str(data_file.name) + " ✅ ")
|
177 |
+
META_DATA.append({"filename":data_file.name})
|
178 |
+
|
179 |
|
180 |
if len(ALL_FILES) > 0:
|
181 |
+
# document_store.update_embeddings(retriever, update_existing_embeddings=False)
|
182 |
+
docs = indexing_pipeline_with_classification.run(file_paths=ALL_FILES, meta=META_DATA)[""]
|
183 |
+
index_name = "qa_demo"
|
184 |
+
# we will use batches of 64
|
185 |
+
batch_size = 64
|
186 |
+
docs = docs['documents']
|
187 |
+
with st.spinner(
|
188 |
+
"🧠 Performing indexing of uplaoded documents... \n "
|
189 |
+
):
|
190 |
+
for i range(0, len(docs), batch_size):
|
191 |
+
# find end of batch
|
192 |
+
i_end = min(i+batch_size, len(docs))
|
193 |
+
# extract batch
|
194 |
+
batch = [doc.content for doc in docs[i:i_end]]
|
195 |
+
# generate embeddings for batch
|
196 |
+
emb = retriever.encode(batch).tolist()
|
197 |
+
# get metadata
|
198 |
+
meta = [doc.meta for doc in docs[i:i_end]]
|
199 |
+
# reuse each document's existing id as the vector ID
|
200 |
+
ids = [doc.id for doc in docs[i:i_end]]
|
201 |
+
# add all to upsert list
|
202 |
+
to_upsert = list(zip(ids, emb, meta))
|
203 |
+
# upsert/insert these records to pinecone
|
204 |
+
_ = index.upsert(vectors=to_upsert)
|
205 |
+
|
206 |
top_k_reader = st.sidebar.slider(
|
207 |
"Max. number of answers",
|
208 |
min_value=1,
|