Adapting committed · Commit b17c1e6 · 1 Parent(s): 675214a
app.py CHANGED
@@ -25,7 +25,10 @@ with st.form("my_form",clear_on_submit=False):
 
     if submitted:
         # body
-        render_body(platforms, number_papers, 5, query_input, show_preview, start_year, end_year, hyperparams)
+        render_body(platforms, number_papers, 5, query_input,
+                    show_preview, start_year, end_year,
+                    hyperparams,
+                    hyperparams['standardization'])
         # '''
         # bar = (
         #     Bar()
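Note for readers following the call chain: the new ninth positional argument lines up with the `standardization` parameter added to `render_body` (see widgets/body.py below). An equivalent, more explicit keyword form of the same call would be:

    render_body(platforms, number_papers, 5, query_input,
                show_preview, start_year, end_year,
                hyperparams,
                standardization=hyperparams['standardization'])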
lrt/clustering/clustering_pipeline.py CHANGED
@@ -3,6 +3,7 @@ from .config import BaselineConfig, Configuration
 from ..utils import __create_model__
 import numpy as np
 from sklearn.cluster import KMeans
+from sklearn.preprocessing import StandardScaler
 from yellowbrick.cluster import KElbowVisualizer
 from .clusters import ClusterList
 
@@ -42,7 +43,7 @@ class ClusterPipeline:
         print(f'>>> finished dimension reduction...')
         return embeddings
 
-    def __3_clustering__(self, embeddings, return_cluster_centers = False, max_k: int = 10):
+    def __3_clustering__(self, embeddings, return_cluster_centers = False, max_k: int = 10, standardization = False):
         '''
 
         :param embeddings: Nxd
@@ -52,6 +53,16 @@ class ClusterPipeline:
             return embeddings
         else:
             print(f'>>> start clustering...')
+
+            ######## new: standardization ########
+            if standardization:
+                print(f'>>> start standardization...')
+                scaler = StandardScaler()
+                embeddings = scaler.fit_transform(embeddings)
+                print(f'>>> finished standardization...')
+            ######## new: standardization ########
+
+
             model = KMeans()
             visualizer = KElbowVisualizer(
                 model, k=(2, max_k+1), metric='silhouette', timings=False, locate_elbow=False
@@ -93,11 +104,11 @@ class ClusterPipeline:
         return clusters
 
 
-    def __call__(self, documents: List[str], max_k: int):
+    def __call__(self, documents: List[str], max_k: int, standardization = False):
         print(f'>>> pipeline starts...')
         x = self.__1_generate_word_embeddings__(documents)
         x = self.__2_dimenstion_reduction__(x)
-        clusters = self.__3_clustering__(x, max_k=max_k)
+        clusters = self.__3_clustering__(x, max_k=max_k, standardization=standardization)
         outputs = self.__4_keywords_extraction__(clusters, documents)
         print(f'>>> pipeline finished!\n')
         return outputs
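For context, a minimal standalone sketch of what the new flag does, with dummy data standing in for the pipeline's real Nxd sentence embeddings: StandardScaler z-scores each embedding dimension (zero mean, unit variance), so no single large-scale dimension dominates the Euclidean distances that KMeans minimizes.

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler

    rng = np.random.default_rng(0)
    embeddings = rng.normal(size=(50, 8))   # stand-in for Nxd embeddings
    embeddings[:, 0] *= 100                 # one dimension on a much larger scale

    scaled = StandardScaler().fit_transform(embeddings)  # per-column z-score
    labels = KMeans(n_clusters=3, n_init=10).fit_predict(scaled)
    print(labels)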
lrt/lrt.py CHANGED
@@ -49,15 +49,16 @@ class LiteratureResearchTool:
              max_k: int,
              platforms: List[str] = ['IEEE', 'Arxiv', 'Paper with Code'],
              loading_ctx_manager = None,
+             standardization = False
              ):
 
 
         for platform in platforms:
             if loading_ctx_manager:
                 with loading_ctx_manager():
-                    clusters, articles = self.__platformPipeline__(platform,query,num_papers,start_year,end_year,max_k)
+                    clusters, articles = self.__platformPipeline__(platform,query,num_papers,start_year,end_year,max_k,standardization)
             else:
-                clusters, articles = self.__platformPipeline__(platform, query, num_papers, start_year, end_year,max_k)
+                clusters, articles = self.__platformPipeline__(platform, query, num_papers, start_year, end_year,max_k,standardization)
 
             clusters.sort()
             yield clusters,articles
@@ -69,7 +70,8 @@
                              num_papers: int,
                              start_year: int,
                              end_year: int,
-                             max_k: int
+                             max_k: int,
+                             standardization
                              ) -> (ClusterList,ArticleList):
 
         @st.cache(hash_funcs={Tokenizer: Tokenizer.__hash__},allow_output_mutation=True)
@@ -82,7 +84,7 @@
             articles = ArticleList.parse_ieee_articles(
                 self.literature_search.ieee(query, start_year, end_year, num_papers))  # ArticleList
             abstracts = articles.getAbstracts()  # List[str]
-            clusters = self.cluster_pipeline(abstracts,max_k)
+            clusters = self.cluster_pipeline(abstracts,max_k,standardization)
             clusters = self.__postprocess_clusters__(clusters)
             return clusters, articles
 
@@ -94,7 +96,7 @@
             articles = ArticleList.parse_arxiv_articles(
                 self.literature_search.arxiv(query, num_papers))  # ArticleList
             abstracts = articles.getAbstracts()  # List[str]
-            clusters = self.cluster_pipeline(abstracts,max_k)
+            clusters = self.cluster_pipeline(abstracts,max_k,standardization)
             clusters = self.__postprocess_clusters__(clusters)
             return clusters, articles
 
@@ -106,7 +108,7 @@
             articles = ArticleList.parse_pwc_articles(
                 self.literature_search.paper_with_code(query, num_papers))  # ArticleList
             abstracts = articles.getAbstracts()  # List[str]
-            clusters = self.cluster_pipeline(abstracts,max_k)
+            clusters = self.cluster_pipeline(abstracts,max_k,standardization)
             clusters = self.__postprocess_clusters__(clusters)
             return clusters, articles
 
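End to end, `LiteratureResearchTool.__call__` remains a generator yielding one `(clusters, articles)` pair per platform; the new flag simply rides along into `cluster_pipeline`. A hedged usage sketch — query, counts, and `config` are made-up illustration values, not taken from this commit:

    model = LiteratureResearchTool(config)   # config built as elsewhere in the app
    generator = model('knowledge graphs', 20, 2018, 2022,
                      max_k=5, platforms=['IEEE', 'Arxiv'],
                      standardization=True)  # new in this commit
    for clusters, articles in generator:
        ...  # render the clusters/articles for each platform in turn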
widgets/body.py CHANGED
@@ -55,7 +55,7 @@ We have found following papers for you! (displaying 5 papers for each literature
 
     paperInGeneral.markdown(paperInGeneral_md)
 
-def render_body(platforms, num_papers, num_papers_preview, query_input, show_preview:bool, start_year, end_year, hyperparams: dict):
+def render_body(platforms, num_papers, num_papers_preview, query_input, show_preview:bool, start_year, end_year, hyperparams: dict, standardization = False):
 
     tmp = st.empty()
     if query_input != '':
@@ -79,7 +79,7 @@ def render_body(platforms, num_papers, num_papers_preview, query_input, show_preview
         )
         model = LiteratureResearchTool(config)
 
-        generator = model(query_input, num_papers, start_year, end_year, max_k=hyperparams['max_k'], platforms=platforms)
+        generator = model(query_input, num_papers, start_year, end_year, max_k=hyperparams['max_k'], platforms=platforms, standardization=standardization)
         for i,plat in enumerate(platforms):
             clusters, articles = next(generator)
             st.markdown(f'''# {i+1} {plat} Results''')
widgets/sidebar.py CHANGED
@@ -3,7 +3,7 @@ import datetime
 # from .utils import PACKAGE_ROOT
 from lrt.utils.functions import template
 
-APP_VERSION = 'v1.2.0'
+APP_VERSION = 'v1.3.0'
 
 def render_sidebar():
     icons = f'''
@@ -70,9 +70,10 @@ def render_sidebar():
     with st.sidebar:
         st.markdown('## Adjust hyperparameters')
         with st.expander('Clustering Options'):
-            dr = st.selectbox('1) Dimension Reduction', options=['none', 'pca'], index=0)
+            standardization = st.selectbox('1) Standardization before clustering', options=['no', 'yes'], index=0)
+            dr = st.selectbox('2) Dimension reduction', options=['none', 'pca'], index=0)
             tmp = min(number_papers,15)
-            max_k = st.slider('2) Max number of clusters', 2, tmp, tmp//2)
+            max_k = st.slider('3) Max number of clusters', 2, tmp, tmp//2)
 
         with st.expander('Keyphrases Generation Options'):
             model_cpt = st.selectbox(label='Model checkpoint', options=template.keywords_extraction.keys(),index=0)
@@ -88,5 +89,6 @@ def render_sidebar():
     return platforms, number_papers, start_year, end_year, dict(
         dimension_reduction= dr,
         max_k = max_k,
-        model_cpt = model_cpt
+        model_cpt = model_cpt,
+        standardization = (standardization == 'yes')
     )
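Putting the sidebar and the form handler together, a sketch of the new data flow (the dict values in the comment are illustrative, and `submitted`, `query_input`, and `show_preview` come from the app.py form shown above):

    platforms, number_papers, start_year, end_year, hyperparams = render_sidebar()
    # hyperparams now resembles:
    # {'dimension_reduction': 'none', 'max_k': 7,
    #  'model_cpt': ..., 'standardization': False}
    if submitted:
        render_body(platforms, number_papers, 5, query_input,
                    show_preview, start_year, end_year,
                    hyperparams,
                    hyperparams['standardization'])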