Added lemma occurrences to nearest neighbours function
- app.py +5 -0
- word2vec.py +7 -2
    	
app.py CHANGED

@@ -136,7 +136,12 @@ if active_tab == "Nearest neighbours":
                         nearest_neighbours[model],
                         columns = ['Word', 'Cosine Similarity']
                     )
+
+                    # Add word occurences to dataframe
+                    df['Occurences'] = df['Word'].apply(lambda x: lemma_counts[model][x])
+
 
+
                     all_dfs.append((model, df))
                     st.table(df)
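The new 'Occurences' column is filled by looking each neighbour up in a per-model lemma count, so lemma_counts[model] has to map words to counts; since count_lemmas() in word2vec.py builds it as a Counter, words that never occur fall back to 0. A minimal sketch of that lookup with made-up data (the model name, words, and counts below are purely illustrative):

    import pandas as pd
    from collections import Counter

    # Hypothetical per-model lemma counts, keyed by display name as in count_lemmas()
    lemma_counts = {'Old English': Counter({'cyning': 12, 'wif': 7})}
    model = 'Old English'

    df = pd.DataFrame([('cyning', 0.83), ('wif', 0.79), ('eorl', 0.74)],
                      columns=['Word', 'Cosine Similarity'])

    # Same lookup as in app.py; a Counter returns 0 for words it has never seen
    df['Occurences'] = df['Word'].apply(lambda x: lemma_counts[model][x])
    print(df)  # 'eorl' gets 0 because it is absent from the Counter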
    	
word2vec.py CHANGED

@@ -464,13 +464,18 @@ def count_lemmas(directory):
     """
     lemma_count_dict = {}
     for file in os.listdir(directory):
+        model_name = file.split('.')[0].replace('_', ' ').capitalize()
+        if len(model_name.split()) == 2:
+            # Also capitalize second part of model name
+            model_name = ' '.join([word.capitalize() for word in model_name.split()])
         if file.endswith(".txt"):
             with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
                 text = f.read()
                 words = text.split()
-                lemma_count_dict[
+                lemma_count_dict[model_name] = Counter(words)
 
     return lemma_count_dict
+


@@ -497,7 +502,7 @@ def main():
     # Iterate over all words and print their vectors
     # iterate_over_words(model)
 
-    count_lemmas('lemma_list_raw')
+    print(count_lemmas('lemma_list_raw'))
 
 
 if __name__ == "__main__":
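count_lemmas() now keys the counts by a display name derived from the file name rather than by the raw file name, which is the name app.py uses to index lemma_counts[model]; the counting relies on Counter, so word2vec.py needs "from collections import Counter" at the top. A self-contained sketch of the function as it reads after this change (the directory and the example file name are illustrative assumptions):

    import os
    from collections import Counter

    def count_lemmas(directory):
        """Count lemma occurrences per model from the .txt files in a directory."""
        lemma_count_dict = {}
        for file in os.listdir(directory):
            # e.g. 'old_english.txt' -> 'Old english' -> 'Old English' (illustrative file name)
            model_name = file.split('.')[0].replace('_', ' ').capitalize()
            if len(model_name.split()) == 2:
                # Also capitalize second part of model name
                model_name = ' '.join([word.capitalize() for word in model_name.split()])
            if file.endswith(".txt"):
                with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
                    words = f.read().split()
                    lemma_count_dict[model_name] = Counter(words)
        return lemma_count_dict

    # As in main(): print(count_lemmas('lemma_list_raw'))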
