prashant committed · Commit fc140bc · Parent(s): 44648c8
final fix in SDG

Changed files:
- appStore/sdg_analysis.py    +28 -26
- utils/checkconfig.py        +12 -0
- utils/keyword_extraction.py +31 -18
- utils/preprocessing.py      +2 -0
- utils/sdg_classifier.py     +44 -51
appStore/sdg_analysis.py CHANGED

@@ -11,12 +11,24 @@ import streamlit as st
 from st_aggrid import AgGrid
 from st_aggrid.shared import ColumnsAutoSizeMode
 from utils.sdg_classifier import sdg_classification
-from utils.sdg_classifier import runSDGPreprocessingPipeline
-from utils.keyword_extraction import
+from utils.sdg_classifier import runSDGPreprocessingPipeline, load_sdgClassifier
+from utils.keyword_extraction import textrank
 import logging
 logger = logging.getLogger(__name__)
+from utils.checkconfig import getconfig


+# Declare all the necessary variables
+config = getconfig('paramconfig.cfg')
+model_name = config.get('sdg','MODEL')
+split_by = config.get('sdg','SPLIT_BY')
+split_length = int(config.get('sdg','SPLIT_LENGTH'))
+split_overlap = int(config.get('sdg','SPLIT_OVERLAP'))
+remove_punc = bool(int(config.get('sdg','REMOVE_PUNC')))
+split_respect_sentence_boundary = bool(int(config.get('sdg','RESPECT_SENTENCE_BOUNDARY')))
+threshold = float(config.get('sdg','THRESHOLD'))
+top_n = int(config.get('sdg','TOP_KEY'))
+

 def app():

@@ -71,35 +83,22 @@ def app():
     """)
     st.markdown("")

-    ### Label Dictionary ###
-    _lab_dict = {0: 'no_cat',
-                1:'SDG 1 - No poverty',
-                2:'SDG 2 - Zero hunger',
-                3:'SDG 3 - Good health and well-being',
-                4:'SDG 4 - Quality education',
-                5:'SDG 5 - Gender equality',
-                6:'SDG 6 - Clean water and sanitation',
-                7:'SDG 7 - Affordable and clean energy',
-                8:'SDG 8 - Decent work and economic growth',
-                9:'SDG 9 - Industry, Innovation and Infrastructure',
-                10:'SDG 10 - Reduced inequality',
-                11:'SDG 11 - Sustainable cities and communities',
-                12:'SDG 12 - Responsible consumption and production',
-                13:'SDG 13 - Climate action',
-                14:'SDG 14 - Life below water',
-                15:'SDG 15 - Life on land',
-                16:'SDG 16 - Peace, justice and strong institutions',
-                17:'SDG 17 - Partnership for the goals',}

     ### Main app code ###
     with st.container():
         if st.button("RUN SDG Analysis"):
-
-
+
             if 'filepath' in st.session_state:
                 file_name = st.session_state['filename']
                 file_path = st.session_state['filepath']
-
+                classifier = load_sdgClassifier(model_name)
+                allDocuments = runSDGPreprocessingPipeline(fileName= file_name,
+                                    filePath= file_path, split_by= split_by,
+                                    split_length= split_length,
+                                    split_overlap= split_overlap,
+                                    split_respect_sentence_boundary= split_respect_sentence_boundary,
+                                    removePunc= remove_punc)
+
                 if len(allDocuments['documents']) > 100:
                     warning_msg = ": This might take sometime, please sit back and relax."
                 else:

@@ -107,12 +106,15 @@ def app():

                 with st.spinner("Running SDG Classification{}".format(warning_msg)):

-                    df, x = sdg_classification(allDocuments['documents'])
+                    df, x = sdg_classification(haystackdoc=allDocuments['documents'],
+                                               threshold= threshold,
+                                               classifiermodel= classifier)
+                    df = df.drop(['Relevancy'], axis = 1)
                     sdg_labels = x.SDG.unique()[::-1]
                     textrankkeywordlist = []
                     for label in sdg_labels:
                         sdgdata = " ".join(df[df.SDG == label].text.to_list())
-                        textranklist_ = textrank(sdgdata)
+                        textranklist_ = textrank(textdata=sdgdata, words= top_n)
                         if len(textranklist_) > 0:
                             textrankkeywordlist.append({'SDG':label, 'TextRank Keywords':",".join(textranklist_)})
                     tRkeywordsDf = pd.DataFrame(textrankkeywordlist)
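The module-level block added above pulls every pipeline parameter from paramconfig.cfg through getconfig. For orientation, here is a minimal sketch of the [sdg] section those config.get calls expect; the key names are taken from the diff, but every value (including the model id) is a placeholder assumption, not the repo's actual configuration:

# paramconfig.cfg (hypothetical values; keys from the config.get calls above)
[sdg]
MODEL = some-user/some-sdg-classifier
SPLIT_BY = word
SPLIT_LENGTH = 120
SPLIT_OVERLAP = 10
REMOVE_PUNC = 0
RESPECT_SENTENCE_BOUNDARY = 1
THRESHOLD = 0.85
TOP_KEY = 15

REMOVE_PUNC and RESPECT_SENTENCE_BOUNDARY are parsed with bool(int(...)), so they must be 0 or 1 rather than true/false.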
utils/checkconfig.py ADDED

@@ -0,0 +1,12 @@
+import configparser
+import logging
+
+def getconfig(configFilePath):
+
+    config = configparser.ConfigParser()
+
+    try:
+        config.read_file(open(configFilePath))
+        return config
+    except:
+        logging.warning("config file not found")
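A quick usage sketch for the new helper, mirroring the call in appStore/sdg_analysis.py. One behavior worth knowing: when the file is missing, the bare except only logs a warning and the function implicitly returns None, so a caller that chains .get() straight onto the result will fail with AttributeError:

from utils.checkconfig import getconfig

config = getconfig('paramconfig.cfg')   # returns None if the file cannot be opened
if config is not None:
    model_name = config.get('sdg', 'MODEL')
    threshold = float(config.get('sdg', 'THRESHOLD'))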
utils/keyword_extraction.py CHANGED

@@ -5,25 +5,13 @@ import pandas as pd
 # from nltk.corpus import stopwords
 import pickle
 from typing import List, Text
-import configparser
 import logging
 from summa import keywords

-try:
-    from termcolor import colored
-except:
-    pass
-
 try:
     import streamlit as st
 except ImportError:
     logging.info("Streamlit not installed")
-config = configparser.ConfigParser()
-try:
-    config.read_file(open('paramconfig.cfg'))
-except Exception:
-    logging.warning("paramconfig file not found")
-    st.info("Please place the paramconfig file in the same directory as app.py")


 def sort_coo(coo_matrix):

@@ -69,6 +57,30 @@ def extract_topn_from_vector(feature_names, sorted_items, top_n=10):

     return results

+
+def tfidfKeyword(textdata, vectorizer, tfidfmodel, top_n):
+    """
+    TFIDF based keywords extraction
+
+    Params
+    ---------
+    vectorizer: trained cont vectorizer model
+    tfidfmodel: TFIDF Tranformer model
+    top_n: Top N keywords to be extracted
+    textdata: text data to which needs keyword extraction
+
+    Return
+    ----------
+    keywords: top extracted keywords
+
+    """
+    features = vectorizer.get_feature_names_out()
+    tf_idf_vector=tfidfmodel.transform(vectorizer.transform(textdata))
+    sorted_items=sort_coo(tf_idf_vector.tocoo())
+    results=extract_topn_from_vector(features,sorted_items,top_n)
+    keywords = [keyword for keyword in results]
+    return keywords
+
 def keywordExtraction(sdg:int,sdgdata:List[Text]):
     """
     TFIDF based keywords extraction

@@ -115,12 +127,13 @@ def textrank(textdata:Text, ratio:float = 0.1, words = 0):
     results: extracted keywords
     """
     if words == 0:
-        try:
-            words = int(config.get('sdg','TOP_KEY'))
-            results = keywords.keywords(textdata, words = words).split("\n")
-        except Exception as e:
-            logging.warning(e)
-
+        # try:
+        #     words = int(config.get('sdg','TOP_KEY'))
+        #     results = keywords.keywords(textdata, words = words).split("\n")
+        # except Exception as e:
+        #     logging.warning(e)
+        logging.info("Textrank using defulat ratio value = 0.1, as no words limit given")
+        results = keywords.keywords(textdata, ratio= ratio).split("\n")
    else:
        try:
            results = keywords.keywords(textdata, words= words).split("\n")
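A minimal sketch of the two extraction paths after this change: textrank as the app now calls it, and the new tfidfKeyword, which assumes a CountVectorizer/TfidfTransformer pair already fitted on some corpus (no fitted models ship with this commit). Assumes summa and scikit-learn are installed; the sample text and corpus are illustrative:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from utils.keyword_extraction import textrank, tfidfKeyword

text = "Access to clean water and sanitation remains a challenge in rural areas."

# TextRank path: words > 0 caps the keyword count; words == 0 falls back to ratio
print(textrank(textdata=text, words=5))

# TF-IDF path: vectorizer and transformer must be fitted before use
corpus = ["clean water and sanitation", "affordable and clean energy"]
vectorizer = CountVectorizer().fit(corpus)
tfidf = TfidfTransformer().fit(vectorizer.transform(corpus))
print(tfidfKeyword([text], vectorizer, tfidf, top_n=3))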
utils/preprocessing.py CHANGED

@@ -179,6 +179,8 @@ class UdfPreProcessor(BaseComponent):
     split_by: document splitting strategy either as word or sentence
     split_length: when synthetically creating the paragrpahs from document,
                   it defines the length of paragraph.
+    split_respect_sentence_boundary: Used when using 'word' strategy for
+                                     splititng of text.

     Return
     ---------
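The two docstring lines added here document split_respect_sentence_boundary, which only matters for the 'word' strategy. Assuming UdfPreProcessor delegates to haystack's own PreProcessor (the wrapper's body is not shown in this diff), the flag behaves roughly like this sketch: paragraphs are cut every split_length words, but a sentence is never split across two paragraphs, even if that makes a paragraph shorter than split_length:

from haystack.nodes import PreProcessor

# Hypothetical stand-alone use of the underlying haystack node:
# cut every 100 words with 10 words of overlap, never splitting mid-sentence.
preprocessor = PreProcessor(split_by="word",
                            split_length=100,
                            split_overlap=10,
                            split_respect_sentence_boundary=True)
docs = preprocessor.process([{"content": "Some long document text ...", "meta": {}}])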
utils/sdg_classifier.py CHANGED

@@ -1,63 +1,55 @@
 from haystack.nodes import TransformersDocumentClassifier
 from haystack.schema import Document
-from typing import List, Tuple
+from typing import List, Tuple, Float
+from typing_extensions import Literal
 import configparser
 import logging
 import pandas as pd
 from pandas import DataFrame, Series
+from utils.checkconfig import getconfig
 from utils.preprocessing import processingpipeline
 try:
     import streamlit as st
 except ImportError:
     logging.info("Streamlit not installed")
-config = configparser.ConfigParser()
-try:
-    config.read_file(open('paramconfig.cfg'))
-except Exception:
-    logging.info("paramconfig file not found")
-    st.info("Please place the paramconfig file in the same directory as app.py")
-
-
-_lab_dict = {0: 'no_cat',
-            1:'SDG 1 - No poverty',
-            2:'SDG 2 - Zero hunger',
-            3:'SDG 3 - Good health and well-being',
-            4:'SDG 4 - Quality education',
-            5:'SDG 5 - Gender equality',
-            6:'SDG 6 - Clean water and sanitation',
-            7:'SDG 7 - Affordable and clean energy',
-            8:'SDG 8 - Decent work and economic growth',
-            9:'SDG 9 - Industry, Innovation and Infrastructure',
-            10:'SDG 10 - Reduced inequality',
-            11:'SDG 11 - Sustainable cities and communities',
-            12:'SDG 12 - Responsible consumption and production',
-            13:'SDG 13 - Climate action',
-            14:'SDG 14 - Life below water',
-            15:'SDG 15 - Life on land',
-            16:'SDG 16 - Peace, justice and strong institutions',
-            17:'SDG 17 - Partnership for the goals',}

 @st.cache(allow_output_mutation=True)
-def load_sdgClassifier():
+def load_sdgClassifier(configFile = None, docClassifierModel = None):
     """
     loads the document classifier using haystack, where the name/path of model
-    in HF-hub as string is used to fetch the model object.
+    in HF-hub as string is used to fetch the model object.Either configfile or
+    model should be passed.
     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
     2. https://docs.haystack.deepset.ai/docs/document_classifier

+    Params
+    --------
+    configFile: config file from which to read the model name
+    docClassifierModel: if modelname is passed, it takes a priority if not \
+    found then will look for configfile, else raise error.
+
+
     Return: document classifier model
     """
-
-
+    if not docClassifierModel:
+        if not configFile:
+            logging.warning("Pass either model name or config file")
+            return
+        else:
+            config = getconfig(configFile)
+            docClassifierModel = config.get('sdg','MODEL')
+
+    logging.info("Loading classifier")
 doc_classifier = TransformersDocumentClassifier(
-
-
+                        model_name_or_path=docClassifierModel,
+                        task="text-classification")
-    return doc_classifier
-

+    return doc_classifier
+

 @st.cache(allow_output_mutation=True)
-def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
+def sdg_classification(haystackdoc:List[Document],
+                       threshold:Float, classifiermodel)->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
     most appropriate label for each text. these labels are in terms of if text
@@ -77,11 +69,7 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:

     """
     logging.info("Working on SDG Classification")
-
-
-
-    classifier = load_sdgClassifier()
-    results = classifier.predict(haystackdoc)
+    results = classifiermodel.predict(haystackdoc)


     labels_= [(l.meta['classification']['label'],
@@ -92,6 +80,8 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
     df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
     df.index += 1
     df =df[df['Relevancy']>threshold]
+
+    # creating the dataframe for value counts of SDG, along with 'title' of SDGs
     x = df['SDG'].value_counts()
     x = x.rename('count')
     x = x.rename_axis('SDG').reset_index()
@@ -99,14 +89,17 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
     x = x.sort_values(by=['count'])
     x['SDG_name'] = x['SDG'].apply(lambda x: _lab_dict[x])
     x['SDG_Num'] = x['SDG'].apply(lambda x: "SDG "+str(x))
-
+
     df['SDG'] = pd.to_numeric(df['SDG'])
     df = df.sort_values('SDG')

-
     return df, x

-def runSDGPreprocessingPipeline(filePath, fileName)->List[Document]:
+def runSDGPreprocessingPipeline(filePath, fileName,
+                        split_by: Literal["sentence", "word"] = 'sentence',
+                        split_respect_sentence_boundary = False,
+                        split_length:int = 2, split_overlap = 0,
+                        removePunc = False)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
     the params for pipeline are fetched from paramconfig
@@ -117,7 +110,12 @@ def runSDGPreprocessingPipeline(filePath, fileName)->List[Document]:
     file_name: filename, in case of streamlit application use
                st.session_state['filename']
     file_path: filepath, in case of streamlit application use
-               st.session_state['filepath']
+    removePunc: to remove all Punctuation including ',' and '.' or not
+    split_by: document splitting strategy either as word or sentence
+    split_length: when synthetically creating the paragrpahs from document,
+                  it defines the length of paragraph.
+    split_respect_sentence_boundary: Used when using 'word' strategy for
+                                     splititng of text.


     Return
@@ -130,16 +128,11 @@ def runSDGPreprocessingPipeline(filePath, fileName)->List[Document]:
     """

     sdg_processing_pipeline = processingpipeline()
-    split_by = config.get('sdg','SPLIT_BY')
-    split_length = int(config.get('sdg','SPLIT_LENGTH'))
-    split_overlap = int(config.get('sdg','SPLIT_OVERLAP'))
-    remove_punc = bool(int(config.get('sdg','REMOVE_PUNC')))
-    split_respect_sentence_boundary = bool(int(config.get('sdg','RESPECT_SENTENCE_BOUNDARY')))

     output_sdg_pre = sdg_processing_pipeline.run(file_paths = filePath,
                             params= {"FileConverter": {"file_path": filePath, \
                                         "file_name": fileName},
-                            "UdfPreProcessor": {"removePunc": remove_punc, \
+                            "UdfPreProcessor": {"removePunc": removePunc, \
                                 "split_by": split_by, \
                                 "split_length":split_length,\
                                 "split_overlap": split_overlap, \