Spaces:

Abhilashvj
/

haystack_QA

Runtime error

App Files Community

abhi001vj committed on Dec 29, 2022

Commit

5546ef7

1 Parent(s): 441daf5

added the fix for indexing docs

Browse files

Files changed (2) hide show

app.py +2 -2
app.py.6195269faeded9e54a105694d6915cf8.tmp +0 -295

app.py CHANGED Viewed

@@ -191,11 +191,11 @@ for data_file in data_files:
 if len(ALL_FILES) > 0:
     # document_store.update_embeddings(retriever, update_existing_embeddings=False)
-    docs = indexing_pipeline_with_classification.run(file_paths=ALL_FILES, meta=META_DATA)[""]
     index_name = "qa_demo"
     # we will use batches of 64
     batch_size = 64
-    docs  = docs['documents']
     with st.spinner(
             "🧠 &nbsp;&nbsp; Performing indexing of uplaoded documents... \n "
         ):

 if len(ALL_FILES) > 0:
     # document_store.update_embeddings(retriever, update_existing_embeddings=False)
+    docs = indexing_pipeline_with_classification.run(file_paths=ALL_FILES, meta=META_DATA)["documents"]
     index_name = "qa_demo"
     # we will use batches of 64
     batch_size = 64
+    # docs  = docs['documents']
     with st.spinner(
             "🧠 &nbsp;&nbsp; Performing indexing of uplaoded documents... \n "
         ):

app.py.6195269faeded9e54a105694d6915cf8.tmp DELETED Viewed

@@ -1,295 +0,0 @@
-import json
-import logging
-import os
-import shutil
-import sys
-import uuid
-from json import JSONDecodeError
-from pathlib import Path
-import pandas as pd
-import pinecone
-import streamlit as st
-from annotated_text import annotation
-from haystack import Document
-from haystack.document_stores import PineconeDocumentStore
-from haystack.nodes import (
-    DocxToTextConverter,
-    EmbeddingRetriever,
-    FARMReader,
-    FileTypeClassifier,
-    PDFToTextConverter,
-    PreProcessor,
-    TextConverter,
-)
-from haystack.pipelines import ExtractiveQAPipeline, Pipeline
-from markdown import markdown
-from sentence_transformers import SentenceTransformer
-index_name = "qa_demo"
-# connect to pinecone environment
-pinecone.init(
-    api_key=st.secrets["pinecone_apikey"],
-    # environment="us-west1-gcp"
-)
-index_name = "qa-demo"
-preprocessor = PreProcessor(
-    clean_empty_lines=True,
-    clean_whitespace=True,
-    clean_header_footer=False,
-    split_by="word",
-    split_length=100,
-    split_respect_sentence_boundary=True
-)
-file_type_classifier = FileTypeClassifier()
-text_converter = TextConverter()
-pdf_converter = PDFToTextConverter()
-docx_converter = DocxToTextConverter()
-# check if the abstractive-question-answering index exists
-if index_name not in pinecone.list_indexes():
-    # create the index if it does not exist
-    pinecone.create_index(
-        index_name,
-        dimension=768,
-        metric="cosine"
-    )
-# connect to abstractive-question-answering index we created
-index = pinecone.Index(index_name)
-FILE_UPLOAD_PATH= "./data/uploads/"
-os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)
-# @st.cache
-def create_doc_store():
-    document_store = PineconeDocumentStore(
-        api_key= st.secrets["pinecone_apikey"],
-        index=index_name,
-        similarity="cosine",
-        embedding_dim=768
-    )
-    return document_store
-# @st.cache
-# def create_pipe(document_store):
-    # retriever = EmbeddingRetriever(
-    # document_store=document_store,
-    # embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
-    # model_format="sentence_transformers",
-    # )
-    # reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
-    # pipe = ExtractiveQAPipeline(reader, retriever)
-    # return pipe
-def query(pipe, question, top_k_reader, top_k_retriever):
-    res = pipe.run(
-        query=question, params={"Retriever": {"top_k": top_k_retriever}, "Reader": {"top_k": top_k_reader}}
-    )
-    answer_df = []
-    # for r in res['answers']:
-    #     ans_dict = res['answers'][0].meta
-    #     ans_dict["answer"] = r.context
-    #     answer_df.append(ans_dict)
-    # result = pd.DataFrame(answer_df)
-    # result.columns = ["Source","Title","Year","Link","Answer"]
-    # result[["Answer","Link","Source","Title","Year"]]
-    return res
-document_store = create_doc_store()
-# pipe = create_pipe(document_store)
-retriever_model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
-retriever = EmbeddingRetriever(
-document_store=document_store,
-embedding_model=retriever_model,
-model_format="sentence_transformers",
-)
-# load the retriever model from huggingface model hub
-sentence_encoder = SentenceTransformer(retriever_model)
-reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
-pipe = ExtractiveQAPipeline(reader, retriever)
-indexing_pipeline_with_classification = Pipeline()
-indexing_pipeline_with_classification.add_node(
-    component=file_type_classifier, name="FileTypeClassifier", inputs=["File"]
-)
-indexing_pipeline_with_classification.add_node(
-    component=text_converter, name="TextConverter", inputs=["FileTypeClassifier.output_1"]
-)
-indexing_pipeline_with_classification.add_node(
-    component=pdf_converter, name="PdfConverter", inputs=["FileTypeClassifier.output_2"]
-)
-indexing_pipeline_with_classification.add_node(
-    component=docx_converter, name="DocxConverter", inputs=["FileTypeClassifier.output_4"]
-)
-indexing_pipeline_with_classification.add_node(
-    component=preprocessor,
-    name="Preprocessor",
-    inputs=["TextConverter", "PdfConverter", "DocxConverter"],
-)
-def set_state_if_absent(key, value):
-    if key not in st.session_state:
-        st.session_state[key] = value
-# Adjust to a question that you would like users to see in the search bar when they load the UI:
-DEFAULT_QUESTION_AT_STARTUP = os.getenv("DEFAULT_QUESTION_AT_STARTUP", "My blog post discusses remote work. Give me statistics.")
-DEFAULT_ANSWER_AT_STARTUP = os.getenv("DEFAULT_ANSWER_AT_STARTUP", "7% more remote workers have been at their current organization for 5 years or fewer")
-# Sliders
-DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
-DEFAULT_NUMBER_OF_ANSWERS = int(os.getenv("DEFAULT_NUMBER_OF_ANSWERS", "3"))
-st.set_page_config(page_title="Haystack Demo", page_icon="https://haystack.deepset.ai/img/HaystackIcon.png")
-# Persistent state
-set_state_if_absent("question", DEFAULT_QUESTION_AT_STARTUP)
-set_state_if_absent("answer", DEFAULT_ANSWER_AT_STARTUP)
-set_state_if_absent("results", None)
-# Small callback to reset the interface in case the text of the question changes
-def reset_results(*args):
-    st.session_state.answer = None
-    st.session_state.results = None
-    st.session_state.raw_json = None
-# Title
-st.write("# Haystack Search Demo")
-st.markdown(
-    """
-This demo takes its data from two sample data csv with statistics on various topics. \n
-Ask any question on this topic and see if Haystack can find the correct answer to your query! \n
-*Note: do not use keywords, but full-fledged questions.* The demo is not optimized to deal with keyword queries and might misunderstand you.
-""",
-    unsafe_allow_html=True,
-)
-# Sidebar
-st.sidebar.header("Options")
-st.sidebar.write("## File Upload:")
-data_files = st.sidebar.file_uploader(
-    "upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="hidden"
-)
-ALL_FILES = []
-META_DATA = []
-for data_file in data_files:
-    # Upload file
-    if data_file:
-        file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{data_file.name}"
-        with open(file_path, "wb") as f:
-            f.write(data_file.getbuffer())
-        ALL_FILES.append(file_path)
-        st.sidebar.write(str(data_file.name) + " &nbsp;&nbsp; ✅ ")
-        META_DATA.append({"filename":data_file.name})
-if len(ALL_FILES) > 0:
-    # document_store.update_embeddings(retriever, update_existing_embeddings=False)
-    docs = indexing_pipeline_with_classification.run(file_paths=ALL_FILES, meta=META_DATA)["documents"]
-    index_name = "qa_demo"
-    # we will use batches of 64
-    batch_size = 64
-    # docs  = docs['documents']
-    with st.spinner(
-            "🧠 &nbsp;&nbsp; Performing indexing of uplaoded documents... \n "
-        ):
-        for i in range(0, len(docs), batch_size):
-            # find end of batch
-            i_end = min(i+batch_size, len(docs))
-            # extract batch
-            batch = [doc.content for doc in docs[i:i_end]]
-            # generate embeddings for batch
-            emb = retriever.encode(batch).tolist()
-            # get metadata
-            meta = [doc.meta for doc in docs[i:i_end]]
-            # create unique IDs
-            ids = [doc.id for doc in docs[i:i_end]]
-            # add all to upsert list
-            to_upsert = list(zip(ids, emb, meta))
-            # upsert/insert these records to pinecone
-            _ = index.upsert(vectors=to_upsert)
-top_k_reader = st.sidebar.slider(
-    "Max. number of answers",
-    min_value=1,
-    max_value=10,
-    value=DEFAULT_NUMBER_OF_ANSWERS,
-    step=1,
-    on_change=reset_results,
-)
-top_k_retriever = st.sidebar.slider(
-    "Max. number of documents from retriever",
-    min_value=1,
-    max_value=10,
-    value=DEFAULT_DOCS_FROM_RETRIEVER,
-    step=1,
-    on_change=reset_results,
-)
-# data_files = st.file_uploader(
-#         "upload", type=["csv"], accept_multiple_files=True, label_visibility="hidden"
-#     )
-# for data_file in data_files:
-#     # Upload file
-#     if data_file:
-#         raw_json = upload_doc(data_file)
-question = st.text_input(
-        value=st.session_state.question,
-        max_chars=100,
-        on_change=reset_results,
-        label="question",
-        label_visibility="hidden",
-    )
-col1, col2 = st.columns(2)
-col1.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
-col2.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
-# Run button
-run_pressed = col1.button("Run")
-if run_pressed:
-    run_query = (
-        run_pressed or question != st.session_state.question
-    )
-    # Get results for query
-    if run_query and question:
-        reset_results()
-        st.session_state.question = question
-        with st.spinner(
-            "🧠 &nbsp;&nbsp; Performing neural search on documents... \n "
-        ):
-            try:
-                st.session_state.results  = query(
-                    pipe, question, top_k_reader=top_k_reader, top_k_retriever=top_k_retriever
-                )
-            except JSONDecodeError as je:
-                st.error("👓 &nbsp;&nbsp; An error occurred reading the results. Is the document store working?")
-            except Exception as e:
-                logging.exception(e)
-                if "The server is busy processing requests" in str(e) or "503" in str(e):
-                    st.error("🧑‍🌾 &nbsp;&nbsp; All our workers are busy! Try again later.")
-                else:
-                    st.error(f"🐞 &nbsp;&nbsp; An error occurred during the request. {str(e)}")
-if st.session_state.results:
-    st.write("## Results:")
-    for count, result in enumerate(st.session_state.results['answers']):
-        answer, context = result.answer, result.context
-        start_idx = context.find(answer)
-        end_idx = start_idx + len(answer)
-        source = f"[{result.meta['Title']}]({result.meta['link']})"
-        # Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
-        st.write(
-            markdown(f'**Source:** {source} \n {context[:start_idx] } {str(annotation(answer, "ANSWER", "#8ef"))} {context[end_idx:]} \n '),
-            unsafe_allow_html=True,
-        )