Spaces: GIZ

ppsingh committed · Commit b5e1233 · verified · 1 Parent(s): aa17801

Delete utils

utils/__init__.py DELETED
@@ -1 +0,0 @@
- # adding for package implementation
 
 
utils/checkconfig.py DELETED
@@ -1,15 +0,0 @@
- import configparser
- import logging
-
- def getconfig(configfile_path:str):
-     """
-     configfile_path: file path of .cfg file
-     """
-
-     config = configparser.ConfigParser()
-
-     try:
-         config.read_file(open(configfile_path))
-         return config
-     except:
-         logging.warning("config file not found")
 
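Note: a minimal sketch of how the removed getconfig helper was typically called; the paramconfig.cfg filename is an assumed placeholder, while the 'sdg'/'MODEL' section and key appear in utils/sdg_classifier.py further down this diff.

    from utils.checkconfig import getconfig  # pre-deletion import path

    config = getconfig("paramconfig.cfg")  # placeholder path, not taken from this commit
    if config is not None:
        model_name = config.get("sdg", "MODEL")  # section/key as read in sdg_classifier.py
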
utils/keyword_extraction.py DELETED
@@ -1,140 +0,0 @@
- import pandas as pd
- # from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
- # import nltk
- # nltk.download('stopwords')
- # from nltk.corpus import stopwords
- import pickle
- from typing import List, Text
- import logging
- from summa import keywords
-
- try:
-     import streamlit as st
- except ImportError:
-     logging.info("Streamlit not installed")
-
-
- def sort_coo(coo_matrix):
-     """
-     It takes Coordinate format scipy sparse matrix and extracts info from same.\
-     1. https://kavita-ganesan.com/python-keyword-extraction/#.Y2-TFHbMJPb
-     """
-     tuples = zip(coo_matrix.col, coo_matrix.data)
-     return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
-
- def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
-     """get the feature names and tf-idf score of top n items
-
-     Params
-     ---------
-     feature_names: list of words from vectorizer
-     sorted_items: tuple returned by sort_coo function defined in \
-         keyword_extraction.py
-     topn: topn words to be extracted using tfidf
-
-     Return
-     ----------
-     results: top extracted keywords
-
-     """
-
-     #use only topn items from vector
-     sorted_items = sorted_items[:top_n]
-     score_vals = []
-     feature_vals = []
-
-     # word index and corresponding tf-idf score
-     for idx, score in sorted_items:
-
-         #keep track of feature name and its corresponding score
-         score_vals.append(round(score, 3))
-         feature_vals.append(feature_names[idx])
-
-     results= {}
-     for idx in range(len(feature_vals)):
-         results[feature_vals[idx]]=score_vals[idx]
-
-     return results
-
-
- def tfidf_keyword(textdata:str, vectorizer, tfidfmodel, top_n):
-     """
-     TFIDF based keywords extraction
-
-     Params
-     ---------
-     vectorizer: trained cont vectorizer model
-     tfidfmodel: TFIDF Tranformer model
-     top_n: Top N keywords to be extracted
-     textdata: text data to which needs keyword extraction
-
-     Return
-     ----------
-     keywords: top extracted keywords
-
-     """
-     features = vectorizer.get_feature_names_out()
-     tf_idf_vector=tfidfmodel.transform(vectorizer.transform(textdata))
-     sorted_items=sort_coo(tf_idf_vector.tocoo())
-     results=extract_topn_from_vector(features,sorted_items,top_n)
-     keywords = [keyword for keyword in results]
-     return keywords
-
- def keyword_extraction(sdg:int,sdgdata:List[Text], top_n:int=10):
-     """
-     TFIDF based keywords extraction
-
-     Params
-     ---------
-     sdg: which sdg tfidf model to be used
-     sdgdata: text data to which needs keyword extraction
-
-
-     Return
-     ----------
-     keywords: top extracted keywords
-
-     """
-     model_path = "docStore/sdg{}/".format(sdg)
-     vectorizer = pickle.load(open(model_path+'vectorizer.pkl', 'rb'))
-     tfidfmodel = pickle.load(open(model_path+'tfidfmodel.pkl', 'rb'))
-     features = vectorizer.get_feature_names_out()
-     tf_idf_vector=tfidfmodel.transform(vectorizer.transform(sdgdata))
-     sorted_items=sort_coo(tf_idf_vector.tocoo())
-     top_n = top_n
-     results=extract_topn_from_vector(features,sorted_items,top_n)
-     keywords = [keyword for keyword in results]
-     return keywords
-
- @st.cache(allow_output_mutation=True)
- def textrank(textdata:Text, ratio:float = 0.1, words:int = 0)->List[str]:
-     """
-     wrappper function to perform textrank, uses either ratio or wordcount to
-     extract top keywords limited by words or ratio.
-     1. https://github.com/summanlp/textrank/blob/master/summa/keywords.py
-
-     Params
-     --------
-     textdata: text data to perform the textrank.
-     ratio: float to limit the number of keywords as proportion of total token \
-         in textdata
-     words: number of keywords to be extracted. Takes priority over ratio if \
-         Non zero. Howevr incase the pagerank returns lesser keywords than \
-         compared to fix value then ratio is used.
-
-     Return
-     --------
-     results: extracted keywords
-     """
-     if words == 0:
-         logging.info("Textrank using defulat ratio value = 0.1, as no words limit given")
-         results = keywords.keywords(textdata, ratio= ratio).split("\n")
-     else:
-         try:
-             results = keywords.keywords(textdata, words= words).split("\n")
-         except:
-             results = keywords.keywords(textdata, ratio = ratio).split("\n")
-
-     return results
-
-
 
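Note: a hedged usage sketch for the removed keyword-extraction helpers, assuming the pickled per-SDG TF-IDF models still sit under docStore/sdg{n}/ as the code above expects; the sample paragraph is invented.

    from utils.keyword_extraction import keyword_extraction, textrank

    paragraphs = ["Access to clean water and sanitation remains a major challenge."]  # illustrative text

    # TF-IDF keywords from the pickled vectorizer/transformer for SDG 6 (docStore/sdg6/)
    tfidf_keywords = keyword_extraction(sdg=6, sdgdata=paragraphs, top_n=10)

    # TextRank keywords on the raw text, capped at 5 words (falls back to ratio internally)
    textrank_keywords = textrank(textdata=" ".join(paragraphs), words=5)
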
utils/lexical_search.py DELETED
@@ -1,251 +0,0 @@
- from haystack.nodes import TfidfRetriever
- from haystack.document_stores import InMemoryDocumentStore
- import spacy
- import re
- from spacy.matcher import Matcher
- from markdown import markdown
- from annotated_text import annotation
- from haystack.schema import Document
- from typing import List, Text, Tuple
- from typing_extensions import Literal
- from utils.preprocessing import processingpipeline
- from utils.streamlitcheck import check_streamlit
- import logging
- try:
-     from termcolor import colored
- except:
-     pass
-
- try:
-     import streamlit as st
- except ImportError:
-     logging.info("Streamlit not installed")
-
-
- def runLexicalPreprocessingPipeline(file_name:str,file_path:str,
-                     split_by: Literal["sentence", "word"] = 'word',
-                     split_length:int = 80, split_overlap:int = 0,
-                     remove_punc:bool = False,)->List[Document]:
-     """
-     creates the pipeline and runs the preprocessing pipeline,
-     the params for pipeline are fetched from paramconfig. As lexical doesnt gets
-     affected by overlap, threfore split_overlap = 0 in default paramconfig and
-     split_by = word.
-
-     Params
-     ------------
-
-     file_name: filename, in case of streamlit application use
-         st.session_state['filename']
-     file_path: filepath, in case of streamlit application use
-         st.session_state['filepath']
-     split_by: document splitting strategy either as word or sentence
-     split_length: when synthetically creating the paragrpahs from document,
-         it defines the length of paragraph.
-     split_overlap: Number of words or sentences that overlap when creating
-         the paragraphs. This is done as one sentence or 'some words' make sense
-         when read in together with others. Therefore the overlap is used.
-         splititng of text.
-     removePunc: to remove all Punctuation including ',' and '.' or not
-
-     Return
-     --------------
-     List[Document]: When preprocessing pipeline is run, the output dictionary
-         has four objects. For the lexicaal search using TFIDFRetriever we
-         need to use the List of Haystack Document, which can be fetched by
-         key = 'documents' on output.
-
-     """
-
-     lexical_processing_pipeline = processingpipeline()
-
-
-     output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
-                             params= {"FileConverter": {"file_path": file_path, \
-                                         "file_name": file_name},
-                                      "UdfPreProcessor": {"remove_punc": remove_punc, \
-                                         "split_by": split_by, \
-                                         "split_length":split_length,\
-                                         "split_overlap": split_overlap}})
-
-     return output_lexical_pre
-
-
- def tokenize_lexical_query(query:str)-> List[str]:
-     """
-     Removes the stop words from query and returns the list of important keywords
-     in query. For the lexical search the relevent paragraphs in document are
-     retreived using TfIDFretreiver from Haystack. However to highlight these
-     keywords we need the tokenized form of query.
-
-     Params
-     --------
-     query: string which represents either list of keywords user is looking for
-         or a query in form of Question.
-
-     Return
-     -----------
-     token_list: list of important keywords in the query.
-
-     """
-     nlp = spacy.load("en_core_web_sm")
-     token_list = [token.text.lower() for token in nlp(query)
-                   if not (token.is_stop or token.is_punct)]
-     return token_list
-
- def runSpacyMatcher(token_list:List[str], document:Text
-                     )->Tuple[List[List[int]],spacy.tokens.doc.Doc]:
-     """
-     Using the spacy in backend finds the keywords in the document using the
-     Matcher class from spacy. We can alternatively use the regex, but spacy
-     finds all keywords in serialized manner which helps in annotation of answers.
-
-     Params
-     -------
-     token_list: this is token list which tokenize_lexical_query function returns
-     document: text in which we need to find the tokens
-
-     Return
-     --------
-     matches: List of [start_index, end_index] in the spacydoc(at word level not
-         character) for the keywords in token list.
-
-     spacydoc: the keyword index in the spacydoc are at word level and not character,
-         therefore to allow the annotator to work seamlessly we return the spacydoc.
-
-     """
-     nlp = spacy.load("en_core_web_sm")
-     spacydoc = nlp(document)
-     matcher = Matcher(nlp.vocab)
-     token_pattern = [[{"LOWER":token}] for token in token_list]
-     matcher.add(",".join(token_list), token_pattern)
-     spacymatches = matcher(spacydoc)
-
-     # getting start and end index in spacydoc so that annotator can work seamlessly
-     matches = []
-     for match_id, start, end in spacymatches:
-         matches = matches + [[start, end]]
-
-     return matches, spacydoc
-
- def runRegexMatcher(token_list:List[str], document:Text):
-     """
-     Using the regex in backend finds the keywords in the document.
-
-     Params
-     -------
-     token_list: this is token list which tokenize_lexical_query function returns
-
-     document: text in which we need to find the tokens
-
-     Return
-     --------
-     matches: List of [start_index, end_index] in the document for the keywords
-         in token list at character level.
-
-     document: the keyword index returned by regex are at character level,
-         therefore to allow the annotator to work seamlessly we return the text back.
-
-     """
-     matches = []
-     for token in token_list:
-         matches = (matches +
-                    [[val.start(), val.start() +
-                      len(token)] for val in re.finditer(token, document)])
-
-     return matches, document
-
- def spacyAnnotator(matches: List[List[int]], document:spacy.tokens.doc.Doc):
-     """
-     This is spacy Annotator and needs spacy.doc
-     Annotates the text in the document defined by list of [start index, end index]
-     Example: "How are you today", if document type is text, matches = [[0,3]]
-     will give answer = "How", however in case we used the spacy matcher then the
-     matches = [[0,3]] will give answer = "How are you". However if spacy is used
-     to find "How" then the matches = [[0,1]] for the string defined above.
-
-     Params
-     -----------
-     matches: As mentioned its list of list. Example [[0,1],[10,13]]
-     document: document which needs to be indexed.
-
-
-     Return
-     --------
-     will send the output to either app front end using streamlit or
-     write directly to output screen.
-
-     """
-     start = 0
-     annotated_text = ""
-     for match in matches:
-         start_idx = match[0]
-         end_idx = match[1]
-
-         if check_streamlit():
-             annotated_text = (annotated_text + document[start:start_idx].text
-                               + str(annotation(body=document[start_idx:end_idx].text,
-                                     label="ANSWER", background="#964448", color='#ffffff')))
-         else:
-             annotated_text = (annotated_text + document[start:start_idx].text
-                               + colored(document[start_idx:end_idx].text,
-                                         "green", attrs = ['bold']))
-
-
-         start = end_idx
-
-     annotated_text = annotated_text + document[end_idx:].text
-
-
-     if check_streamlit():
-
-         st.write(
-             markdown(annotated_text),
-             unsafe_allow_html=True,
-         )
-     else:
-         print(annotated_text)
-
- def lexical_search(query:Text, documents:List[Document],top_k:int):
-     """
-     Performs the Lexical search on the List of haystack documents which is
-     returned by preprocessing Pipeline.
-
-     Params
-     -------
-     query: Keywords that need to be searche in documents.
-     documents: List of Haystack documents returned by preprocessing pipeline.
-     top_k: Number of Top results to be fetched.
-
-     """
-
-     document_store = InMemoryDocumentStore()
-     document_store.write_documents(documents)
-
-     # Haystack Retriever works with document stores only.
-     retriever = TfidfRetriever(document_store)
-     results = retriever.retrieve(query=query, top_k = top_k)
-     query_tokens = tokenize_lexical_query(query)
-     flag = True
-     for count, result in enumerate(results):
-         matches, doc = runSpacyMatcher(query_tokens,result.content)
-
-         if len(matches) != 0:
-             if flag:
-                 flag = False
-                 if check_streamlit():
-                     st.markdown("##### Top few lexical search (TFIDF) hits #####")
-                 else:
-                     print("Top few lexical search (TFIDF) hits")
-
-             if check_streamlit():
-                 st.write("Result {}".format(count+1))
-             else:
-                 print("Results {}".format(count +1))
-             spacyAnnotator(matches, doc)
-
-     if flag:
-         if check_streamlit():
-             st.info("🤔 No relevant result found. Please try another keyword.")
-         else:
-             print("No relevant result found. Please try another keyword.")
 
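Note: a minimal sketch of the removed lexical-search flow, assuming the pre-deletion utils package is still importable; the file name and query below are placeholders.

    from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search

    output = runLexicalPreprocessingPipeline(file_name="example.pdf",             # placeholder file
                                             file_path="docStore/sample/example.pdf")
    # TF-IDF retrieval plus keyword highlighting (Streamlit or terminal output)
    lexical_search(query="water sanitation",
                   documents=output['documents'],
                   top_k=5)
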
utils/ndc_explorer.py DELETED
@@ -1,90 +0,0 @@
-
- import urllib.request
- import json
-
- link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
- def get_document(country_code: str):
-     """
-     read the country NDC data from
-     https://klimalog.die-gdi.de/ndc/open-data/dataset.json
-     using the country code.
-
-     Params
-     -------
-     country_code:"""
-     with urllib.request.urlopen(link) as urlfile:
-         data = json.loads(urlfile.read())
-     categoriesData = {}
-     categoriesData['categories']= data['categories']
-     categoriesData['subcategories']= data['subcategories']
-     keys_sub = categoriesData['subcategories'].keys()
-     documentType= 'NDCs'
-     if documentType in data.keys():
-         if country_code in data[documentType].keys():
-             get_dict = {}
-             for key, value in data[documentType][country_code].items():
-                 if key not in ['country_name','region_id', 'region_name']:
-                     get_dict[key] = value['classification']
-                 else:
-                     get_dict[key] = value
-         else:
-             return None
-     else:
-         return None
-
-     country = {}
-     for key in categoriesData['categories']:
-         country[key]= {}
-     for key,value in categoriesData['subcategories'].items():
-         country[value['category']][key] = get_dict[key]
-
-     return country
-
-
- def countrySpecificCCA(cca_sent:dict, threshold:int, countryCode:str):
-     """
-     based on the countrycode, reads the country data from
-     https://klimalog.die-gdi.de/ndc/open-data/dataset.json
-     using get_documents from utils.ndc_explorer.py
-     then based on thereshold value filters the Climate Change Adaptation
-     targets assigned by NDC explorer team to that country. Using the sentences
-     create by Data services team of GIZ for each target level, tries to find the
-     relevant passages from the document by doing the semantic search.
-
-     Params
-     -------
-     cca_sent: dictionary with key as 'target labels' and manufactured sentences
-         reflecting the target level. Please see the docStore/ndcs/cca.txt
-
-     threshold: NDC target have many categoriees ranging from [0-5], with 0
-         refelcting most relaxed attitude and 5 being most aggrisive towards Climate
-         change. We select the threshold value beyond which we need to focus on.
-
-     countryCode: standard country code to allow us to fetch the country specific
-         data.
-
-     """
-     temp = {}
-     doc = get_document(countryCode)
-     for key,value in cca_sent.items():
-         id_ = doc['climate change adaptation'][key]['id']
-         if id_ >threshold:
-             temp[key] = value['id'][id_]
-     return temp
-
-
- def countrySpecificCCM(ccm_sent, threshold, countryCode):
-     """
-     see the documentation of countrySpecificCCA. This is same instead of
-     this gets the data pertaining to Adaptation
-
-     """
-
-     temp = {}
-     doc = get_document(countryCode)
-     for key,value in ccm_sent.items():
-         id_ = doc['climate change mitigation'][key]['id']
-         if id_ >threshold:
-             temp[key] = value['id'][id_]
-
-     return temp
 
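Note: a hedged sketch of how the removed NDC explorer helpers were called; the country code, target label and sentence below are invented placeholders — the real sentences were read from docStore/ndcs/cca.txt, as the docstring above states.

    from utils.ndc_explorer import get_document, countrySpecificCCA

    country_data = get_document("KEN")   # placeholder country code

    cca_sent = {"adaptation planning":                                             # placeholder label
                {"id": {4: "The NDC sets out a detailed adaptation plan."}}}       # placeholder sentence
    selected = countrySpecificCCA(cca_sent=cca_sent, threshold=3, countryCode="KEN")
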
utils/preprocessing.py DELETED
@@ -1,260 +0,0 @@
- from haystack.nodes.base import BaseComponent
- from haystack.schema import Document
- from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
- from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
- from typing import Callable, Dict, List, Optional, Text, Tuple, Union
- from typing_extensions import Literal
- import pandas as pd
- import logging
- import re
- import string
- from haystack.pipelines import Pipeline
-
- def useOCR(file_path: str)-> Text:
-     """
-     Converts image pdfs into text, Using the Farm-haystack[OCR]
-
-     Params
-     ----------
-     file_path: file_path of uploade file, returned by add_upload function in
-         uploadAndExample.py
-
-     Returns the text file as string.
-     """
-
-
-     converter = PDFToTextOCRConverter(remove_numeric_tables=True,
-                                       valid_languages=["eng"])
-     docs = converter.convert(file_path=file_path, meta=None)
-     return docs[0].content
-
-
-
-
-
- class FileConverter(BaseComponent):
-     """
-     Wrapper class to convert uploaded document into text by calling appropriate
-     Converter class, will use internally haystack PDFToTextOCR in case of image
-     pdf. Cannot use the FileClassifier from haystack as its doesnt has any
-     label/output class for image.
-
-     1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
-     2. https://docs.haystack.deepset.ai/docs/file_converters
-     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
-     4. https://docs.haystack.deepset.ai/reference/file-converters-api
-
-
-     """
-
-     outgoing_edges = 1
-
-     def run(self, file_name: str , file_path: str, encoding: Optional[str]=None,
-             id_hash_keys: Optional[List[str]] = None,
-             ) -> Tuple[dict,str]:
-         """ this is required method to invoke the component in
-         the pipeline implementation.
-
-         Params
-         ----------
-         file_name: name of file
-         file_path: file_path of uploade file, returned by add_upload function in
-             uploadAndExample.py
-
-         See the links provided in Class docstring/description to see other params
-
-         Return
-         ---------
-         output: dictionary, with key as identifier and value could be anything
-             we need to return. In this case its the List of Hasyatck Document
-
-         output_1: As there is only one outgoing edge, we pass 'output_1' string
-         """
-         try:
-             if file_name.endswith('.pdf'):
-                 converter = PDFToTextConverter(remove_numeric_tables=True)
-             if file_name.endswith('.txt'):
-                 converter = TextConverter(remove_numeric_tables=True)
-             if file_name.endswith('.docx'):
-                 converter = DocxToTextConverter()
-         except Exception as e:
-             logging.error(e)
-             return
-
-
-
-         documents = []
-
-         document = converter.convert(
-                       file_path=file_path, meta=None,
-                       encoding=encoding, id_hash_keys=id_hash_keys
-                       )[0]
-
-         text = document.content
-
-         # if file is image pdf then it will have {'content': "\x0c\x0c\x0c\x0c"}
-         # subsitute this substring with '',and check if content is empty string
-
-         text = re.sub(r'\x0c', '', text)
-         documents.append(Document(content=text,
-                                   meta={"name": file_name},
-                                   id_hash_keys=id_hash_keys))
-
-
-         # check if text is empty and apply pdfOCR converter.
-         for i in documents:
-             if i.content == "":
-                 logging.info("Using OCR")
-                 i.content = useOCR(file_path)
-
-         logging.info('file conversion succesful')
-         output = {'documents': documents}
-         return output, 'output_1'
-
-     def run_batch():
-         """
-         we dont have requirement to process the multiple files in one go
-         therefore nothing here, however to use the custom node we need to have
-         this method for the class.
-         """
-
-         return
-
-
- def basic(s:str, remove_punc:bool = False):
-
-     """
-     Performs basic cleaning of text.
-
-     Params
-     ----------
-     s: string to be processed
-     removePunc: to remove all Punctuation including ',' and '.' or not
-
-     Returns: processed string: see comments in the source code for more info
-     """
-
-     # Remove URLs
-     s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
-     s = re.sub(r"http\S+", " ", s)
-
-     # Remove new line characters
-     s = re.sub('\n', ' ', s)
-
-     # Remove punctuations
-     if remove_punc == True:
-         translator = str.maketrans(' ', ' ', string.punctuation)
-         s = s.translate(translator)
-     # Remove distracting single quotes and dotted pattern
-     s = re.sub("\'", " ", s)
-     s = s.replace("..","")
-
-     return s.strip()
-
-
- class UdfPreProcessor(BaseComponent):
-     """
-     class to preprocess the document returned by FileConverter. It will check
-     for splitting strategy and splits the document by word or sentences and then
-     synthetically create the paragraphs.
-
-     1. https://docs.haystack.deepset.ai/docs/preprocessor
-     2. https://docs.haystack.deepset.ai/reference/preprocessor-api
-     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
-
-     """
-     outgoing_edges = 1
-
-     def run(self, documents:List[Document], remove_punc:bool=False,
-             split_by: Literal["sentence", "word"] = 'sentence',
-             split_length:int = 2, split_respect_sentence_boundary:bool = False,
-             split_overlap:int = 0):
-
-         """ this is required method to invoke the component in
-         the pipeline implementation.
-
-         Params
-         ----------
-         documents: documents from the output dictionary returned by Fileconverter
-         remove_punc: to remove all Punctuation including ',' and '.' or not
-         split_by: document splitting strategy either as word or sentence
-         split_length: when synthetically creating the paragrpahs from document,
-             it defines the length of paragraph.
-         split_respect_sentence_boundary: Used when using 'word' strategy for
-             splititng of text.
-         split_overlap: Number of words or sentences that overlap when creating
-             the paragraphs. This is done as one sentence or 'some words' make sense
-             when read in together with others. Therefore the overlap is used.
-
-         Return
-         ---------
-         output: dictionary, with key as identifier and value could be anything
-             we need to return. In this case the output will contain 4 objects
-             the paragraphs text list as List, Haystack document, Dataframe and
-             one raw text file.
-
-         output_1: As there is only one outgoing edge, we pass 'output_1' string
-
-         """
-
-         if split_by == 'sentence':
-             split_respect_sentence_boundary = False
-
-         else:
-             split_respect_sentence_boundary = split_respect_sentence_boundary
-
-         preprocessor = PreProcessor(
-             clean_empty_lines=True,
-             clean_whitespace=True,
-             clean_header_footer=True,
-             split_by=split_by,
-             split_length=split_length,
-             split_respect_sentence_boundary= split_respect_sentence_boundary,
-             split_overlap=split_overlap,
-
-             # will add page number only in case of PDF not for text/docx file.
-             add_page_number=True
-         )
-
-         for i in documents:
-             # # basic cleaning before passing it to preprocessor.
-             # i = basic(i)
-             docs_processed = preprocessor.process([i])
-             for item in docs_processed:
-                 item.content = basic(item.content, remove_punc= remove_punc)
-
-         df = pd.DataFrame(docs_processed)
-         all_text = " ".join(df.content.to_list())
-         para_list = df.content.to_list()
-         logging.info('document split into {} paragraphs'.format(len(para_list)))
-         output = {'documents': docs_processed,
-                   'dataframe': df,
-                   'text': all_text,
-                   'paraList': para_list
-                  }
-         return output, "output_1"
-     def run_batch():
-         """
-         we dont have requirement to process the multiple files in one go
-         therefore nothing here, however to use the custom node we need to have
-         this method for the class.
-         """
-         return
-
- def processingpipeline():
-     """
-     Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcesor
-     from utils.preprocessing
-
-     """
-
-     preprocessing_pipeline = Pipeline()
-     file_converter = FileConverter()
-     custom_preprocessor = UdfPreProcessor()
-
-     preprocessing_pipeline.add_node(component=file_converter,
-                                     name="FileConverter", inputs=["File"])
-     preprocessing_pipeline.add_node(component = custom_preprocessor,
-                                     name ='UdfPreProcessor', inputs=["FileConverter"])
-
-     return preprocessing_pipeline
-
 
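Note: a minimal sketch of the removed preprocessing pipeline, mirroring how the other utils modules in this commit invoke it; the file path below is a placeholder.

    from utils.preprocessing import processingpipeline

    pipeline = processingpipeline()
    result = pipeline.run(file_paths="docStore/sample/example.pdf",               # placeholder file
                          params={"FileConverter": {"file_path": "docStore/sample/example.pdf",
                                                    "file_name": "example.pdf"},
                                  "UdfPreProcessor": {"remove_punc": False,
                                                      "split_by": "sentence",
                                                      "split_length": 2,
                                                      "split_overlap": 0}})
    paragraphs = result['paraList']   # other keys: 'documents', 'dataframe', 'text'
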
utils/sdg_classifier.py DELETED
@@ -1,177 +0,0 @@
- from haystack.nodes import TransformersDocumentClassifier
- from haystack.schema import Document
- from typing import List, Tuple
- from typing_extensions import Literal
- import logging
- import pandas as pd
- from pandas import DataFrame, Series
- from utils.checkconfig import getconfig
- from utils.streamlitcheck import check_streamlit
- from utils.preprocessing import processingpipeline
- try:
-     import streamlit as st
- except ImportError:
-     logging.info("Streamlit not installed")
-
- ## Labels dictionary ###
- _lab_dict = {0: 'no_cat',
-              1:'SDG 1 - No poverty',
-              2:'SDG 2 - Zero hunger',
-              3:'SDG 3 - Good health and well-being',
-              4:'SDG 4 - Quality education',
-              5:'SDG 5 - Gender equality',
-              6:'SDG 6 - Clean water and sanitation',
-              7:'SDG 7 - Affordable and clean energy',
-              8:'SDG 8 - Decent work and economic growth',
-              9:'SDG 9 - Industry, Innovation and Infrastructure',
-              10:'SDG 10 - Reduced inequality',
-              11:'SDG 11 - Sustainable cities and communities',
-              12:'SDG 12 - Responsible consumption and production',
-              13:'SDG 13 - Climate action',
-              14:'SDG 14 - Life below water',
-              15:'SDG 15 - Life on land',
-              16:'SDG 16 - Peace, justice and strong institutions',
-              17:'SDG 17 - Partnership for the goals',}
-
- @st.cache(allow_output_mutation=True)
- def load_sdgClassifier(config_file:str = None, classifier_name:str = None):
-     """
-     loads the document classifier using haystack, where the name/path of model
-     in HF-hub as string is used to fetch the model object.Either configfile or
-     model should be passed.
-     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
-     2. https://docs.haystack.deepset.ai/docs/document_classifier
-
-     Params
-     --------
-     config_file: config file path from which to read the model name
-     classifier_name: if modelname is passed, it takes a priority if not \
-         found then will look for configfile, else raise error.
-
-
-     Return: document classifier model
-     """
-     if not classifier_name:
-         if not config_file:
-             logging.warning("Pass either model name or config file")
-             return
-         else:
-             config = getconfig(config_file)
-             classifier_name = config.get('sdg','MODEL')
-
-     logging.info("Loading classifier")
-     doc_classifier = TransformersDocumentClassifier(
-                         model_name_or_path=classifier_name,
-                         task="text-classification")
-
-     return doc_classifier
-
-
- @st.cache(allow_output_mutation=True)
- def sdg_classification(haystack_doc:List[Document],
-                        threshold:float = 0.8,
-                        classifier_model:TransformersDocumentClassifier= None
-                        )->Tuple[DataFrame,Series]:
-     """
-     Text-Classification on the list of texts provided. Classifier provides the
-     most appropriate label for each text. these labels are in terms of if text
-     belongs to which particular Sustainable Devleopment Goal (SDG).
-
-     Params
-     ---------
-     haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
-         contains the list of paragraphs in different format,here the list of
-         Haystack Documents is used.
-     threshold: threshold value for the model to keep the results from classifier
-     classifiermodel: you can pass the classifier model directly,which takes priority
-         however if not then looks for model in streamlit session.
-         In case of streamlit avoid passing the model directly.
-
-
-     Returns
-     ----------
-     df: Dataframe with two columns['SDG:int', 'text']
-     x: Series object with the unique SDG covered in the document uploaded and
-         the number of times it is covered/discussed/count_of_paragraphs.
-
-     """
-     logging.info("Working on SDG Classification")
-     if not classifier_model:
-         if check_streamlit():
-             classifier_model = st.session_state['sdg_classifier']
-         else:
-             logging.warning("No streamlit envinornment found, Pass the classifier")
-             return
-
-     results = classifier_model.predict(haystack_doc)
-
-
-     labels_= [(l.meta['classification']['label'],
-                l.meta['classification']['score'],l.content,) for l in results]
-
-     df = DataFrame(labels_, columns=["SDG","Relevancy","text"])
-
-     df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
-     df.index += 1
-     df =df[df['Relevancy']>threshold]
-
-     # creating the dataframe for value counts of SDG, along with 'title' of SDGs
-     x = df['SDG'].value_counts()
-     x = x.rename('count')
-     x = x.rename_axis('SDG').reset_index()
-     x["SDG"] = pd.to_numeric(x["SDG"])
-     x = x.sort_values(by=['count'], ascending=False)
-     x['SDG_name'] = x['SDG'].apply(lambda x: _lab_dict[x])
-     x['SDG_Num'] = x['SDG'].apply(lambda x: "SDG "+str(x))
-
-     df['SDG'] = pd.to_numeric(df['SDG'])
-     df = df.sort_values('SDG')
-
-     return df, x
-
- def runSDGPreprocessingPipeline(file_name:str, file_path:str,
-             split_by: Literal["sentence", "word"] = 'sentence',
-             split_length:int = 2, split_respect_sentence_boundary:bool = False,
-             split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
-     """
-     creates the pipeline and runs the preprocessing pipeline,
-     the params for pipeline are fetched from paramconfig
-
-     Params
-     ------------
-
-     file_name: filename, in case of streamlit application use
-         st.session_state['filename']
-     file_path: filepath, in case of streamlit application use st.session_state['filepath']
-     split_by: document splitting strategy either as word or sentence
-     split_length: when synthetically creating the paragrpahs from document,
-         it defines the length of paragraph.
-     split_respect_sentence_boundary: Used when using 'word' strategy for
-         splititng of text.
-     split_overlap: Number of words or sentences that overlap when creating
-         the paragraphs. This is done as one sentence or 'some words' make sense
-         when read in together with others. Therefore the overlap is used.
-     remove_punc: to remove all Punctuation including ',' and '.' or not
-
-
-     Return
-     --------------
-     List[Document]: When preprocessing pipeline is run, the output dictionary
-         has four objects. For the Haysatck implementation of SDG classification we,
-         need to use the List of Haystack Document, which can be fetched by
-         key = 'documents' on output.
-
-     """
-
-     sdg_processing_pipeline = processingpipeline()
-
-     output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
-                         params= {"FileConverter": {"file_path": file_path, \
-                                     "file_name": file_name},
-                                  "UdfPreProcessor": {"remove_punc": remove_punc, \
-                                     "split_by": split_by, \
-                                     "split_length":split_length,\
-                                     "split_overlap": split_overlap, \
-                                     "split_respect_sentence_boundary":split_respect_sentence_boundary}})
-
-     return output_sdg_pre
 
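Note: a hedged sketch of the removed SDG classification flow; the classifier model id and file path are placeholders — in the app the model name was read from the config file via getconfig, as the code above shows.

    from utils.sdg_classifier import (load_sdgClassifier, runSDGPreprocessingPipeline,
                                      sdg_classification)

    docs = runSDGPreprocessingPipeline(file_name="example.pdf",                        # placeholder file
                                       file_path="docStore/sample/example.pdf")['documents']
    classifier = load_sdgClassifier(classifier_name="org/sdg-classifier")              # placeholder model id
    df, counts = sdg_classification(haystack_doc=docs, threshold=0.8,
                                    classifier_model=classifier)
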
utils/semantic_search.py DELETED
@@ -1,582 +0,0 @@
- from haystack.nodes import TransformersQueryClassifier, Docs2Answers
- from haystack.nodes import EmbeddingRetriever, FARMReader
- from haystack.nodes.base import BaseComponent
- from haystack.document_stores import InMemoryDocumentStore
- from markdown import markdown
- from annotated_text import annotation
- from haystack.schema import Document
- from typing import List, Text, Union
- from typing_extensions import Literal
- from utils.preprocessing import processingpipeline
- from utils.streamlitcheck import check_streamlit
- from haystack.pipelines import Pipeline
- import pandas as pd
- import logging
- try:
-     from termcolor import colored
- except:
-     pass
- try:
-     import streamlit as st
- except ImportError:
-     logging.info("Streamlit not installed")
-
-
- @st.cache(allow_output_mutation=True)
- def loadQueryClassifier():
-     """
-     retuns the haystack query classifier model
-     model = shahrukhx01/bert-mini-finetune-question-detection
-
-     """
-     query_classifier = TransformersQueryClassifier(model_name_or_path=
-         "shahrukhx01/bert-mini-finetune-question-detection")
-     return query_classifier
-
- class QueryCheck(BaseComponent):
-     """
-     Uses Query Classifier from Haystack, process the query based on query type.
-     Ability to determine the statements is not so good, therefore the chances
-     statement also get modified. Ex: "List water related issues" will be
-     identified by the model as keywords, and therefore it be processed as "what
-     are the 'list all water related issues' related issues and discussions?".
-     This is one shortcoming but is igonred for now, as semantic search will not
-     get affected a lot, by this. If you want to pass keywords list and want to
-     do batch processing use. run_batch. Example: if you want to find relevant
-     passages for water, food security, poverty then querylist = ["water", "food
-     security","poverty"] and then execute QueryCheck.run_batch(queries = querylist)
-
-     1. https://docs.haystack.deepset.ai/docs/query_classifier
-
-     """
-
-     outgoing_edges = 1
-
-     def run(self, query:str):
-         """
-         mandatory method to use the custom node. Determines the query type, if
-         if the query is of type keyword/statement will modify it to make it more
-         useful for sentence transoformers.
-
-         Params
-         --------
-         query: query/statement/keywords in form of string
-
-         Return
-         ------
-         output: dictionary, with key as identifier and value could be anything
-             we need to return. In this case the output contain key = 'query'.
-
-         output_1: As there is only one outgoing edge, we pass 'output_1' string
-
-         """
-         query_classifier = loadQueryClassifier()
-         result = query_classifier.run(query=query)
-
-         if result[1] == "output_1":
-             output = {"query":query,
-                       "query_type": 'question/statement'}
-         else:
-             output = {"query": "what are the {} related issues and \
-                       discussions?".format(query),
-                       "query_type": 'statements/keyword'}
-         logging.info(output)
-         return output, "output_1"
-
-     def run_batch(self, queries:List[str]):
-         """
-         running multiple queries in one go, howeevr need the queries to be passed
-         as list of string. Example: if you want to find relevant passages for
-         water, food security, poverty then querylist = ["water", "food security",
-         "poverty"] and then execute QueryCheck.run_batch(queries = querylist)
-
-         Params
-         --------
-         queries: queries/statements/keywords in form of string encapsulated
-             within List
-
-         Return
-         ------
-         output: dictionary, with key as identifier and value could be anything
-             we need to return. In this case the output contain key = 'queries'.
-
-         output_1: As there is only one outgoing edge, we pass 'output_1' string
-         """
-         query_classifier = loadQueryClassifier()
-         query_list = []
-         for query in queries:
-             result = query_classifier.run(query=query)
-             if result[1] == "output_1":
-                 query_list.append(query)
-             else:
-                 query_list.append("what are the {} related issues and \
-                                   discussions?".format(query))
-         output = {'queries':query_list}
-         logging.info(output)
-         return output, "output_1"
-
-
- @st.cache(allow_output_mutation=True)
- def runSemanticPreprocessingPipeline(file_path:str, file_name:str,
-                 split_by: Literal["sentence", "word"] = 'sentence',
-                 split_length:int = 2, split_overlap:int = 0,
-                 split_respect_sentence_boundary:bool = False,
-                 remove_punc:bool = False)->List[Document]:
-     """
-     creates the pipeline and runs the preprocessing pipeline.
-
-     Params
-     ------------
-
-     file_name: filename, in case of streamlit application use
-         st.session_state['filename']
-     file_path: filepath, in case of streamlit application use
-         st.session_state['filepath']
-     split_by: document splitting strategy either as word or sentence
-     split_length: when synthetically creating the paragrpahs from document,
-         it defines the length of paragraph.
-     split_overlap: Number of words or sentences that overlap when creating the
-         paragraphs. This is done as one sentence or 'some words' make sense
-         when read in together with others. Therefore the overlap is used.
-     split_respect_sentence_boundary: Used when using 'word' strategy for
-         splititng of text.
-     remove_punc: to remove all Punctuation including ',' and '.' or not
-
-     Return
-     --------------
-     List[Document]: When preprocessing pipeline is run, the output dictionary
-         has four objects. For the Haysatck implementation of semantic search we,
-         need to use the List of Haystack Document, which can be fetched by
-         key = 'documents' on output.
-
-     """
-
-     semantic_processing_pipeline = processingpipeline()
-
-     output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
-                             params= {"FileConverter": {"file_path": file_path, \
-                                         "file_name": file_name},
-                                      "UdfPreProcessor": {"remove_punc": remove_punc, \
-                                         "split_by": split_by, \
-                                         "split_length":split_length,\
-                                         "split_overlap": split_overlap,
-                                         "split_respect_sentence_boundary":split_respect_sentence_boundary}})
-
-     return output_semantic_pre
-
-
- @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
-           allow_output_mutation=True)
- def loadRetriever(embedding_model:Text=None, embedding_model_format:Text = None,
-                   embedding_layer:int = None, retriever_top_k:int = 10,
-                   max_seq_len:int=512, document_store:InMemoryDocumentStore=None):
-     """
-     Returns the Retriever model based on params provided.
-     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
-     2. https://www.sbert.net/examples/applications/semantic-search/README.html
-     3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
-
-
-     Params
-     ---------
-     embedding_model: Name of the model to be used for embedding. Check the links
-         provided in documentation
-     embedding_model_format: check the github link of Haystack provided in
-         documentation embedding_layer: check the github link of Haystack
-         provided in documentation retriever_top_k: Number of Top results to
-         be returned by
-     retriever max_seq_len: everymodel has max seq len it can handle, check in
-         model card. Needed to hanlde the edge cases.
-     document_store: InMemoryDocumentStore, write haystack Document list to
-         DocumentStore and pass the same to function call. Can be done using
-         createDocumentStore from utils.
-
-     Return
-     -------
-     retriever: embedding model
-     """
-     logging.info("loading retriever")
-     if document_store is None:
-         logging.warning("Retriever initialization requires the DocumentStore")
-         return
-
-     retriever = EmbeddingRetriever(
-                 embedding_model=embedding_model,top_k = retriever_top_k,
-                 document_store = document_store,
-                 emb_extraction_layer=embedding_layer, scale_score =True,
-                 model_format=embedding_model_format, use_gpu = True,
-                 max_seq_len = max_seq_len )
-     if check_streamlit:
-         st.session_state['retriever'] = retriever
-     return retriever
-
- @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
-           allow_output_mutation=True)
- def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
-                         embedding_dim:int = 768):
-     """
-     Creates the InMemory Document Store from haystack list of Documents.
-     It is mandatory component for Retriever to work in Haystack frame work.
-
-     Params
-     -------
-     documents: List of haystack document. If using the preprocessing pipeline,
-         can be fetched key = 'documents; on output of preprocessing pipeline.
-     similarity: scoring function, can be either 'cosine' or 'dot_product'
-     embedding_dim: Document store has default value of embedding size = 768, and
-         update_embeddings method of Docstore cannot infer the embedding size of
-         retiever automatically, therefore set this value as per the model card.
-
-     Return
-     -------
-     document_store: InMemory Document Store object type.
-
-     """
-     document_store = InMemoryDocumentStore(similarity = similarity,
-                                            embedding_dim = embedding_dim )
-     document_store.write_documents(documents)
-
-     return document_store
-
-
- @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
-           allow_output_mutation=True)
- def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
-                 embedding_model_format:Text = None,embedding_layer:int = None,
-                 embedding_dim:int = 768,retriever_top_k:int = 10,
-                 reader_model:str = None, reader_top_k:int = 10,
-                 max_seq_len:int =512,useQueryCheck = True,
-                 top_k_per_candidate:int = 1):
-     """
-     creates the semantic search pipeline and document Store object from the
-     list of haystack documents. The top_k for the Reader and Retirever are kept
-     same, so that all the results returned by Retriever are used, however the
-     context is extracted by Reader for each retrieved result. The querycheck is
-     added as node to process the query. This pipeline is suited for keyword search,
-     and to some extent extractive QA purpose. The purpose of Reader is strictly to
-     highlight the context for retrieved result and not for QA, however as stated
-     it can work for QA too in limited sense.
-     There are 4 variants of pipeline it can return
-     1.QueryCheck > Retriever > Reader
-     2.Retriever > Reader
-     3.QueryCheck > Retriever > Docs2Answers : If reader is None,
-         then Doc2answer is used to keep the output of pipeline structurally same.
-     4.Retriever > Docs2Answers
-
-     Links
-
-     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
-     2. https://www.sbert.net/examples/applications/semantic-search/README.html
-     3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
-     4. https://docs.haystack.deepset.ai/docs/reader
-
-
-     Params
-     ----------
-     documents: list of Haystack Documents, returned by preprocessig pipeline.
-     embedding_model: Name of the model to be used for embedding. Check the links
-         provided in documentation
-     embedding_model_format: check the github link of Haystack provided in
-         documentation
-     embedding_layer: check the github link of Haystack provided in documentation
-     embedding_dim: Document store has default value of embedding size = 768, and
-         update_embeddings method of Docstore cannot infer the embedding size of
-         retiever automatically, therefore set this value as per the model card.
-     retriever_top_k: Number of Top results to be returned by retriever
-     reader_model: Name of the model to be used for Reader node in hasyatck
-         Pipeline. Check the links provided in documentation
-     reader_top_k: Reader will use retrieved results to further find better matches.
-         As purpose here is to use reader to extract context, the value is
-         same as retriever_top_k.
-     max_seq_len:everymodel has max seq len it can handle, check in model card.
-         Needed to hanlde the edge cases
-     useQueryCheck: Whether to use the querycheck which modifies the query or not.
-     top_k_per_candidate:How many answers to extract for each candidate doc
-         that is coming from the retriever
-
-     Return
-     ---------
-     semanticsearch_pipeline: Haystack Pipeline object, with all the necessary
-         nodes [QueryCheck, Retriever, Reader/Docs2Answer]. If reader is None,
-         then Doc2answer is used to keep the output of pipeline structurally
-         same.
-
-     document_store: As retriever can work only with Haystack Document Store, the
-         list of document returned by preprocessing pipeline are fed into to
-         get InMemmoryDocumentStore object type, with retriever updating the
-         embeddings of each paragraph in document store.
-
-     """
-     document_store = createDocumentStore(documents=documents,
-                                          embedding_dim=embedding_dim)
-     retriever = loadRetriever(embedding_model = embedding_model,
-                     embedding_model_format=embedding_model_format,
-                     embedding_layer=embedding_layer,
-                     retriever_top_k= retriever_top_k,
-                     document_store = document_store,
-                     max_seq_len=max_seq_len)
-     document_store.update_embeddings(retriever)
-     semantic_search_pipeline = Pipeline()
-     if useQueryCheck and reader_model:
-         querycheck = QueryCheck()
-         reader = FARMReader(model_name_or_path=reader_model,
-                             top_k = reader_top_k, use_gpu=True,
-                             top_k_per_candidate = top_k_per_candidate)
-         semantic_search_pipeline.add_node(component = querycheck,
-                             name = "QueryCheck",inputs = ["Query"])
-         semantic_search_pipeline.add_node(component = retriever,
-                             name = "EmbeddingRetriever",inputs = ["QueryCheck.output_1"])
-         semantic_search_pipeline.add_node(component = reader, name = "FARMReader",
-                             inputs= ["EmbeddingRetriever"])
-
-     elif reader_model :
-         reader = FARMReader(model_name_or_path=reader_model,
-                             top_k = reader_top_k, use_gpu=True,
-                             top_k_per_candidate = top_k_per_candidate)
-         semantic_search_pipeline.add_node(component = retriever,
-                             name = "EmbeddingRetriever",inputs = ["Query"])
-         semantic_search_pipeline.add_node(component = reader,
-                             name = "FARMReader",inputs= ["EmbeddingRetriever"])
-     elif useQueryCheck and not reader_model:
-         querycheck = QueryCheck()
-         docs2answers = Docs2Answers()
-         semantic_search_pipeline.add_node(component = querycheck,
-                             name = "QueryCheck",inputs = ["Query"])
-         semantic_search_pipeline.add_node(component = retriever,
-                             name = "EmbeddingRetriever",inputs = ["QueryCheck.output_1"])
-         semantic_search_pipeline.add_node(component = docs2answers,
-                             name = "Docs2Answers",inputs= ["EmbeddingRetriever"])
-     elif not useQueryCheck and not reader_model:
-         docs2answers = Docs2Answers()
-         semantic_search_pipeline.add_node(component = retriever,
-                             name = "EmbeddingRetriever",inputs = ["Query"])
-         semantic_search_pipeline.add_node(component = docs2answers,
-                             name = "Docs2Answers",inputs= ["EmbeddingRetriever"])
-
-     logging.info(semantic_search_pipeline.components)
-     return semantic_search_pipeline, document_store
-
- def runSemanticPipeline(pipeline:Pipeline, queries:Union[list,str])->dict:
-     """
-     will use the haystack run or run_batch based on if single query is passed
-     as string or multiple queries as List[str]
-
-     Params
-     -------
-     pipeline: haystack pipeline, this is same as returned by semanticSearchPipeline
-         from utils.semanticsearch
-
-     queries: Either a single query or list of queries.
-
-     Return
-     -------
-     results: Dict containing answers and documents as key and their respective
-         values
-
-     """
-
-     if type(queries) == list:
-         results = pipeline.run_batch(queries=queries)
-     elif type(queries) == str:
-         results = pipeline.run(query=queries)
-     else:
-         logging.info("Please check the input type for the queries")
-         return
-
-     return results
-
- def process_query_output(results:dict)->pd.DataFrame:
-     """
-     Returns the dataframe with necessary information like including
-     ['query','answer','answer_offset','context_offset','context','content',
-     'reader_score','retriever_score','id',]. This is designed for output given
-     by semantic search pipeline with single query and final node as reader.
-     The output of pipeline having Docs2Answers as final node or multiple queries
-     need to be handled separately. In these other cases, use process_semantic_output
-     from utils.semantic_search which uses this function internally to make one
-     combined dataframe.
-
-     Params
-     ---------
-     results: this dictionary should have key,values with
-         keys = [query,answers,documents], however answers is optional.
-         in case of [Doc2Answers as final node], process_semantic_output
-         doesnt return answers thereby setting all values contained in
-         answers to 'None'
-
-     Return
-     --------
-     df: dataframe with all the columns mentioned in function description.
-
-     """
-     query_text = results['query']
-     if 'answers' in results.keys():
-         answer_dict = {}
-
-         for answer in results['answers']:
-             answer_dict[answer.document_id] = answer.to_dict()
-     else:
-         answer_dict = {}
-     docs = results['documents']
-     df = pd.DataFrame(columns=['query','answer','answer_offset','context_offset',
-                                'context','content','reader_score','retriever_score',
-                                'id'])
-     for doc in docs:
-         row_list = {}
-         row_list['query'] = query_text
-         row_list['retriever_score'] = doc.score
-         row_list['id'] = doc.id
-         row_list['content'] = doc.content
-         if doc.id in answer_dict.keys():
-             row_list['answer'] = answer_dict[doc.id]['answer']
-             row_list['context'] = answer_dict[doc.id]['context']
-             row_list['reader_score'] = answer_dict[doc.id]['score']
-             answer_offset = answer_dict[doc.id]['offsets_in_document'][0]
-             row_list['answer_offset'] = [answer_offset['start'],answer_offset['end']]
-             start_idx = doc.content.find(row_list['context'])
-             end_idx = start_idx + len(row_list['context'])
-             row_list['context_offset'] = [start_idx, end_idx]
-         else:
-             row_list['answer'] = None
-             row_list['context'] = None
-             row_list['reader_score'] = None
-             row_list['answer_offset'] = None
-             row_list['context_offset'] = None
-         df_dictionary = pd.DataFrame([row_list])
-         df = pd.concat([df, df_dictionary], ignore_index=True)
-
-     return df
-
- def process_semantic_output(results):
-     """
-     Returns the dataframe with necessary information like including
-     ['query','answer','answer_offset','context_offset','context','content',
-     'reader_score','retriever_score','id',]. Distingushes if its single query or
-     multi queries by reading the pipeline output dictionary keys.
-     Uses the process_query_output to get the dataframe for each query and create
-     one concataneted dataframe. In case of Docs2Answers as final node, deletes
-     the answers part. See documentations of process_query_output.
-
-     Params
-     ---------
-     results: raw output of runSemanticPipeline.
-
-     Return
-     --------
-     df: dataframe with all the columns mentioned in function description.
-
-     """
-     output = {}
-     if 'query' in results.keys():
-         output['query'] = results['query']
-         output['documents'] = results['documents']
-         if results['node_id'] == 'Docs2Answers':
-             pass
-         else:
-             output['answers'] = results['answers']
-         df = process_query_output(output)
-         return df
-     if 'queries' in results.keys():
-         df = pd.DataFrame(columns=['query','answer','answer_offset',
-                                    'context_offset','context','content',
-                                    'reader_score','retriever_score','id'])
-         for query,answers,documents in zip(results['queries'],
-                                            results['answers'],results['documents']):
-             output = {}
-             output['query'] = query
-             output['documents'] = documents
-             if results['node_id'] == 'Docs2Answers':
-                 pass
-             else:
-                 output['answers'] = answers
-
-             temp = process_query_output(output)
-             df = pd.concat([df, temp], ignore_index=True)
-
-
-     return df
-
- def semanticsearchAnnotator(matches:List[List[int]], document:Text):
-     """
-     Annotates the text in the document defined by list of [start index, end index]
-     Example: "How are you today", if document type is text, matches = [[0,3]]
-     will give answer = "How", however in case we used the spacy matcher then the
-     matches = [[0,3]] will give answer = "How are you". However if spacy is used
-     to find "How" then the matches = [[0,1]] for the string defined above.
-
-     """
-     start = 0
-     annotated_text = ""
-     for match in matches:
-         start_idx = match[0]
-         end_idx = match[1]
-         if check_streamlit():
-             annotated_text = (annotated_text + document[start:start_idx]
-                               + str(annotation(body=document[start_idx:end_idx],
-                               label="Context", background="#964448", color='#ffffff')))
-         else:
-             annotated_text = (annotated_text + document[start:start_idx]
-                               + colored(document[start_idx:end_idx],
-                               "green", attrs = ['bold']))
-         start = end_idx
-
-     annotated_text = annotated_text + document[end_idx:]
-
-     if check_streamlit():
-
-         st.write(
-             markdown(annotated_text),
-             unsafe_allow_html=True,
-         )
-     else:
-         print(annotated_text)
-
-
- def semantic_keywordsearch(query:Text,documents:List[Document],
-                 embedding_model:Text,
-                 embedding_model_format:Text,
-                 embedding_layer:int, reader_model:str,
-                 retriever_top_k:int = 10, reader_top_k:int = 10,
-                 return_results:bool = False, embedding_dim:int = 768,
-                 max_seq_len:int = 512,top_k_per_candidate:int =1,
-                 sort_by:Literal["retriever", "reader"] = 'retriever'):
-     """
-     Performs the Semantic search on the List of haystack documents which is
-     returned by preprocessing Pipeline.
-
-     Params
-     -------
-     query: Keywords that need to be searche in documents.
-     documents: List fo Haystack documents returned by preprocessing pipeline.
-
-     """
-     semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents = documents,
-                     embedding_model= embedding_model,
-                     embedding_layer= embedding_layer,
-                     embedding_model_format= embedding_model_format,
-                     reader_model= reader_model, retriever_top_k= retriever_top_k,
-                     reader_top_k= reader_top_k, embedding_dim=embedding_dim,
-                     max_seq_len=max_seq_len,
-                     top_k_per_candidate=top_k_per_candidate)
-
-     raw_output = runSemanticPipeline(semanticsearch_pipeline,query)
-     results_df = process_semantic_output(raw_output)
-     if sort_by == 'retriever':
-         results_df = results_df.sort_values(by=['retriever_score'], ascending=False)
-     else:
-         results_df = results_df.sort_values(by=['reader_score'], ascending=False)
-
-     if return_results:
-         return results_df
-     else:
-         if check_streamlit:
-             st.markdown("##### Top few semantic search results #####")
-         else:
-             print("Top few semantic search results")
-         for i in range(len(results_df)):
-             if check_streamlit:
-                 st.write("Result {}".format(i+1))
-             else:
-                 print("Result {}".format(i+1))
-             semanticsearchAnnotator([results_df.loc[i]['context_offset']],
-                                     results_df.loc[i]['content'] )
 
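Note: a hedged sketch of the removed semantic-search entry point; the embedding and reader model names and the file path are placeholder values (the app read the real ones from its config), and the call assumes a Haystack install matching the code above.

    from utils.semantic_search import (runSemanticPreprocessingPipeline,
                                       semantic_keywordsearch)

    output = runSemanticPreprocessingPipeline(file_path="docStore/sample/example.pdf",  # placeholder file
                                              file_name="example.pdf")
    semantic_keywordsearch(query="climate change adaptation",
                           documents=output['documents'],
                           embedding_model="sentence-transformers/all-mpnet-base-v2",   # placeholder model
                           embedding_model_format="sentence_transformers",
                           embedding_layer=None,
                           reader_model="deepset/tinyroberta-squad2",                   # placeholder model
                           retriever_top_k=10, reader_top_k=10)
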
utils/streamlitcheck.py DELETED
@@ -1,42 +0,0 @@
- import logging
- try:
-     import streamlit as st
- except ImportError:
-     logging.info("Streamlit not installed")
-
-
- def check_streamlit():
-     """
-     Function to check whether python code is run within streamlit
-
-     Returns
-     -------
-     use_streamlit : boolean
-         True if code is run within streamlit, else False
-     """
-     try:
-         from streamlit.scriptrunner.script_run_context import get_script_run_ctx
-         if not get_script_run_ctx():
-             use_streamlit = False
-         else:
-             use_streamlit = True
-     except ModuleNotFoundError:
-         use_streamlit = False
-     return use_streamlit
-
- def disable_other_checkboxes(*other_checkboxes_keys):
-     for checkbox_key in other_checkboxes_keys:
-         st.session_state[checkbox_key] = False
-
- def checkbox_without_preselect(keylist):
-     dict_ = {}
-     for i,key_val in enumerate(keylist):
-         dict_[i] = st.checkbox(key_val,key = key_val,
-                                on_change = disable_other_checkboxes,
-                                args=tuple(list(filter(lambda x: x!= key_val, keylist))),)
-
-     for key,val in dict_.items():
-         if val == True:
-             return keylist[int(key)]
-
-     return None
 
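Note: a minimal sketch of the removed Streamlit helpers; the checkbox labels below are placeholders, not taken from this commit.

    from utils.streamlitcheck import check_streamlit, checkbox_without_preselect

    if check_streamlit():
        # inside a Streamlit script: mutually exclusive checkboxes, none preselected
        choice = checkbox_without_preselect(["SDG Classification", "Lexical Search"])  # placeholder labels
    else:
        print("Not running inside Streamlit")
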
utils/uploadAndExample.py DELETED
@@ -1,33 +0,0 @@
- import streamlit as st
- import tempfile
- import json
-
- def add_upload(choice):
-     """
-     Provdies the user with choice to either 'Upload Document' or 'Try Example'.
-     Based on user choice runs streamlit processes and save the path and name of
-     the 'file' to streamlit session_state which then can be fetched later.
-
-     """
-
-     if choice == 'Upload Document':
-         uploaded_file = st.sidebar.file_uploader('Upload the File',
-                                                  type=['pdf', 'docx', 'txt'])
-         if uploaded_file is not None:
-             with tempfile.NamedTemporaryFile(mode="wb", delete = False) as temp:
-                 bytes_data = uploaded_file.getvalue()
-                 temp.write(bytes_data)
-                 st.session_state['filename'] = uploaded_file.name
-                 st.session_state['filepath'] = temp.name
-
-
-     else:
-         # listing the options
-         with open('docStore/sample/files.json','r') as json_file:
-             files = json.load(json_file)
-
-         option = st.sidebar.selectbox('Select the example document',
-                                       list(files.keys()))
-         file_name = file_path = files[option]
-         st.session_state['filename'] = file_name
-         st.session_state['filepath'] = file_path
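
Note: a minimal sketch of how the removed add_upload helper was wired into the Streamlit sidebar; the radio widget shown here is an assumption, but the two option labels come from the docstring above.

    import streamlit as st
    from utils.uploadAndExample import add_upload

    choice = st.sidebar.radio("Select", ('Upload Document', 'Try Example'))  # labels per the docstring
    add_upload(choice)
    # downstream code then reads st.session_state['filename'] and st.session_state['filepath']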