Spaces:
				
			
			
	
			
			
		Running
		
			on 
			
			Zero
	
	
	
			
			
	
	
	
	
		
		
		Running
		
			on 
			
			Zero
	
		Sean-Case
		
	committed on
		
		
					Commit 
							
							·
						
						fac3624
	
1
								Parent(s):
							
							aa3df37
								
Greatly increased low resource process dimensions for higher quality. Visualisations disabled by default to increase speed.
Browse files
- app.py +3 -3
- funcs/embeddings.py +1 -1
- funcs/representation_model.py +1 -1
    	
        app.py
    CHANGED
    
    | @@ -295,7 +295,7 @@ with block: | |
| 295 | 
             
                        candidate_topics = gr.File(label="Input topics from file (csv). File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file. Currently not compatible with low-resource embeddings.")
         | 
| 296 |  | 
| 297 | 
             
                    with gr.Row():
         | 
| 298 | 
            -
                        min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of documents  | 
| 299 | 
             
                        max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 3, step = 1, label = "Maximum number of topics")
         | 
| 300 |  | 
| 301 | 
             
                    with gr.Row():
         | 
| @@ -305,7 +305,7 @@ with block: | |
| 305 | 
             
                        output_single_text = gr.Textbox(label="Output example (first example in dataset)")
         | 
| 306 | 
             
                        output_file = gr.File(label="Output file")
         | 
| 307 |  | 
| 308 | 
            -
                    plot = gr.Plot(label="Visualise your topics here | 
| 309 |  | 
| 310 | 
             
                with gr.Tab("Options"):
         | 
| 311 | 
             
                    with gr.Accordion("Data load and processing options", open = True):
         | 
| @@ -317,7 +317,7 @@ with block: | |
| 317 | 
             
                            low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="No", choices=["Yes", "No"])
         | 
| 318 | 
             
                            create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
         | 
| 319 | 
             
                            save_topic_model = gr.Dropdown(label = "Save topic model to file.", value="Yes", choices=["Yes", "No"])
         | 
| 320 | 
            -
                            visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value=" | 
| 321 |  | 
| 322 | 
             
                # Update column names dropdown when file uploaded
         | 
| 323 | 
             
                in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])    
         | 
|  | |
| 295 | 
             
                        candidate_topics = gr.File(label="Input topics from file (csv). File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file. Currently not compatible with low-resource embeddings.")
         | 
| 296 |  | 
| 297 | 
             
                    with gr.Row():
         | 
| 298 | 
            +
                        min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of similar documents needed to make a topic.")
         | 
| 299 | 
             
                        max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 3, step = 1, label = "Maximum number of topics")
         | 
| 300 |  | 
| 301 | 
             
                    with gr.Row():
         | 
|  | |
| 305 | 
             
                        output_single_text = gr.Textbox(label="Output example (first example in dataset)")
         | 
| 306 | 
             
                        output_file = gr.File(label="Output file")
         | 
| 307 |  | 
| 308 | 
            +
                    plot = gr.Plot(label="Visualise your topics here. Go to the 'Options' tab to enable.")
         | 
| 309 |  | 
| 310 | 
             
                with gr.Tab("Options"):
         | 
| 311 | 
             
                    with gr.Accordion("Data load and processing options", open = True):
         | 
|  | |
| 317 | 
             
                            low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="No", choices=["Yes", "No"])
         | 
| 318 | 
             
                            create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
         | 
| 319 | 
             
                            save_topic_model = gr.Dropdown(label = "Save topic model to file.", value="Yes", choices=["Yes", "No"])
         | 
| 320 | 
            +
                            visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="No", choices=["Yes", "No"])
         | 
| 321 |  | 
| 322 | 
             
                # Update column names dropdown when file uploaded
         | 
| 323 | 
             
                in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])    
         | 
    	
        funcs/embeddings.py
    CHANGED
    
    | @@ -35,7 +35,7 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo | |
| 35 | 
             
                        print("Creating simplified 'sparse' embeddings based on TfIDF")
         | 
| 36 | 
             
                        embedding_model = make_pipeline(
         | 
| 37 | 
             
                        TfidfVectorizer(),
         | 
| 38 | 
            -
                        TruncatedSVD( | 
| 39 | 
             
                        )
         | 
| 40 |  | 
| 41 | 
             
                        # Fit the pipeline to the text data
         | 
|  | |
| 35 | 
             
                        print("Creating simplified 'sparse' embeddings based on TfIDF")
         | 
| 36 | 
             
                        embedding_model = make_pipeline(
         | 
| 37 | 
             
                        TfidfVectorizer(),
         | 
| 38 | 
            +
                        TruncatedSVD(2000, random_state=random_seed)
         | 
| 39 | 
             
                        )
         | 
| 40 |  | 
| 41 | 
             
                        # Fit the pipeline to the text data
         | 
    	
        funcs/representation_model.py
    CHANGED
    
    | @@ -119,7 +119,7 @@ llm_config = LLamacppInitConfigGpu(last_n_tokens_size=last_n_tokens_size, | |
| 119 | 
             
            # KeyBERT
         | 
| 120 | 
             
            keybert = KeyBERTInspired(random_state=random_seed)
         | 
| 121 | 
             
            # MMR
         | 
| 122 | 
            -
            mmr = MaximalMarginalRelevance(diversity=0. | 
| 123 |  | 
| 124 | 
             
            def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode):
         | 
| 125 |  | 
|  | |
| 119 | 
             
            # KeyBERT
         | 
| 120 | 
             
            keybert = KeyBERTInspired(random_state=random_seed)
         | 
| 121 | 
             
            # MMR
         | 
| 122 | 
            +
            mmr = MaximalMarginalRelevance(diversity=0.2)
         | 
| 123 |  | 
| 124 | 
             
            def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode):
         | 
| 125 |  |