Spaces:
				
			
			
	
			
			
		Running
		
			on 
			
			Zero
	
	
	
			
			
	
	
	
	
		
		
		Running
		
			on 
			
			Zero
	
		Sean-Case
		
	committed on
		
		
					Commit 
							
							·
						
						e0f53cc
	
1
								Parent(s):
							
							ff32b4a
								
Now should save embeddings by default. Added random seed to representation
Browse files
- app.py +9 -1
- funcs/representation_model.py +2 -2
    	
        app.py
    CHANGED
    
    | @@ -242,7 +242,15 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s | |
| 242 | 
             
                zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
         | 
| 243 | 
             
                output_list.append(topic_model_save_name_zip)
         | 
| 244 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 245 | 
             
                # Visualise the topics:
         | 
|  | |
| 246 | 
             
                topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
         | 
| 247 |  | 
| 248 | 
             
                return output_text, output_list, topics_vis
         | 
| @@ -290,7 +298,7 @@ with block: | |
| 290 | 
             
                    with gr.Accordion("Data load and processing options", open = True):
         | 
| 291 | 
             
                        with gr.Row():
         | 
| 292 | 
             
                            anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
         | 
| 293 | 
            -
                            return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value=" | 
| 294 | 
             
                            embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
         | 
| 295 | 
             
                        with gr.Row():
         | 
| 296 | 
             
                            low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings model based on TF-IDF (consider if embedding generation is slow).", value="No", choices=["Yes", "No"])
         | 
|  | |
| 242 | 
             
                zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
         | 
| 243 | 
             
                output_list.append(topic_model_save_name_zip)
         | 
| 244 |  | 
| 245 | 
            +
                if return_intermediate_files == "Yes":
         | 
| 246 | 
            +
                    print("Saving embeddings to file")
         | 
| 247 | 
            +
                    semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
         | 
| 248 | 
            +
                    np.savez_compressed(semantic_search_file_name, embeddings_out)
         | 
| 249 | 
            +
             | 
| 250 | 
            +
                    output_list.append(semantic_search_file_name)
         | 
| 251 | 
            +
             | 
| 252 | 
             
                # Visualise the topics:
         | 
| 253 | 
            +
                print("Creating visualisation")
         | 
| 254 | 
             
                topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
         | 
| 255 |  | 
| 256 | 
             
                return output_text, output_list, topics_vis
         | 
|  | |
| 298 | 
             
                    with gr.Accordion("Data load and processing options", open = True):
         | 
| 299 | 
             
                        with gr.Row():
         | 
| 300 | 
             
                            anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
         | 
| 301 | 
            +
                            return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="Yes", choices=["Yes", "No"])
         | 
| 302 | 
             
                            embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
         | 
| 303 | 
             
                        with gr.Row():
         | 
| 304 | 
             
                            low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings model based on TF-IDF (consider if embedding generation is slow).", value="No", choices=["Yes", "No"])
         | 
    	
        funcs/representation_model.py
    CHANGED
    
    | @@ -9,7 +9,7 @@ import torch.cuda | |
| 9 | 
             
            from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
         | 
| 10 | 
             
            from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
         | 
| 11 |  | 
| 12 | 
            -
             | 
| 13 |  | 
| 14 | 
             
            chosen_prompt = open_hermes_prompt # stablelm_prompt 
         | 
| 15 | 
             
            chosen_start_tag =  open_hermes_start # stablelm_start
         | 
| @@ -117,7 +117,7 @@ llm_config = LLamacppInitConfigGpu(last_n_tokens_size=last_n_tokens_size, | |
| 117 |  | 
| 118 | 
             
            ## Create representation model parameters ##
         | 
| 119 | 
             
            # KeyBERT
         | 
| 120 | 
            -
            keybert = KeyBERTInspired()
         | 
| 121 |  | 
| 122 | 
             
            def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag):
         | 
| 123 |  | 
|  | |
| 9 | 
             
            from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
         | 
| 10 | 
             
            from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
         | 
| 11 |  | 
| 12 | 
            +
            random_seed = 42
         | 
| 13 |  | 
| 14 | 
             
            chosen_prompt = open_hermes_prompt # stablelm_prompt 
         | 
| 15 | 
             
            chosen_start_tag =  open_hermes_start # stablelm_start
         | 
|  | |
| 117 |  | 
| 118 | 
             
            ## Create representation model parameters ##
         | 
| 119 | 
             
            # KeyBERT
         | 
| 120 | 
            +
            keybert = KeyBERTInspired(random_state=random_seed)
         | 
| 121 |  | 
| 122 | 
             
            def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag):
         | 
| 123 |  |