Spaces:

seanpedrickcase
/

data_text_search

Sleeping

App Files Files Community

seanpedrickcase committed on Feb 15, 2024

Commit

36a404e

1 Parent(s): 2bcd818

Added highlight search term functionality to keyword search output

Browse files

Files changed (6) hide show

.gitignore +1 -0
app.py +12 -4
example_highlight.txt +10 -0
requirements.txt +2 -1
search_funcs/bm25_functions.py +5 -2
search_funcs/helper_functions.py +115 -3

.gitignore CHANGED Viewed

@@ -15,6 +15,7 @@
 *.npz
 *.pkl
 *.pkl.gz
 build/*
 dist/*
 __pycache__/*

 *.npz
 *.pkl
 *.pkl.gz
+*.pem
 build/*
 dist/*
 __pycache__/*

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ PandasDataFrame = Type[pd.DataFrame]
 from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
 from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
 from search_funcs.semantic_functions import docs_to_bge_embed_np_array, bge_simple_retrieval
-from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder
 from search_funcs.spacy_search_funcs import spacy_fuzzy_search
 # Attempt to delete temporary files generated by previous use of the app (as the files can be very big!)
@@ -157,7 +157,7 @@ depends on factors such as the type of documents or queries. Information taken f
     ### BM25 SEARCH ###
     # Update dropdowns upon initial file load
-    in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, search_index_state, embeddings_state, tokenised_state, load_finished_message, current_source])
     in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
     # Load in BM25 data
@@ -174,7 +174,7 @@ depends on factors such as the type of documents or queries. Information taken f
     ### SEMANTIC SEARCH ###
     # Load in a csv/excel file for semantic search
-    in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column,  search_df_join_column, semantic_data_state, search_index_state, embeddings_state, tokenised_state, semantic_load_progress, current_source_semantic])
     load_semantic_data_button.click(
         csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
         then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
@@ -183,5 +183,13 @@ depends on factors such as the type of documents or queries. Information taken f
     semantic_submit.click(bge_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
     semantic_query.submit(bge_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
-block.queue().launch(debug=True)

 from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
 from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
 from search_funcs.semantic_functions import docs_to_bge_embed_np_array, bge_simple_retrieval
+from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder
 from search_funcs.spacy_search_funcs import spacy_fuzzy_search
 # Attempt to delete temporary files generated by previous use of the app (as the files can be very big!)
     ### BM25 SEARCH ###
     # Update dropdowns upon initial file load
+    in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, search_index_state, embeddings_state, tokenised_state, load_finished_message, current_source])
     in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
     # Load in BM25 data
     ### SEMANTIC SEARCH ###
     # Load in a csv/excel file for semantic search
+    in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column,  search_df_join_column, semantic_data_state, search_index_state, embeddings_state, tokenised_state, semantic_load_progress, current_source_semantic])
     load_semantic_data_button.click(
         csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
         then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
     semantic_submit.click(bge_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
     semantic_query.submit(bge_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
+# Simple run for HF spaces or local on your computer
+block.queue().launch(debug=True)
+# Running on local server without https
+#block.queue().launch(server_name="0.0.0.0", server_port=7861, ssl_verify=False)
+# Running on local server with https: https://discuss.huggingface.co/t/how-to-run-gradio-with-0-0-0-0-and-https/38003 or https://dev.to/rajshirolkar/fastapi-over-https-for-development-on-windows-2p7d # Need to download OpenSSL and create own keys
+# block.queue().launch(ssl_verify=False, share=False, debug=False, server_name="0.0.0.0",server_port=443,
+#                      ssl_certfile="cert.pem", ssl_keyfile="key.pem") # port 443 for https. Certificates currently not valid

example_highlight.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+# Sample DataFrame
+data = {
+    'Column1': ['This is a specific substring example', 'Another example', 'One more'],
+    'Column2': ['Some data', 'Another data', 'More data']
+}
+df = pd.DataFrame(data)
+# Define the column to highlight and the substrings to highlight
+column_to_highlight = 'Column1'
+substrings_to_highlight = ['specific', 'example']

requirements.txt CHANGED Viewed

@@ -8,4 +8,5 @@ torch==2.1.2
 spacy==3.7.2
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
 gradio==4.16.0
-sentence_transformers==2.3.1

 spacy==3.7.2
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
 gradio==4.16.0
+sentence_transformers==2.3.1
+lxml==5.1.0

search_funcs/bm25_functions.py CHANGED Viewed

@@ -14,7 +14,7 @@ from datetime import datetime
 today_rev = datetime.now().strftime("%Y%m%d")
 from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
-from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end
 # Load the SpaCy model
 from spacy.cli.download import download
@@ -517,7 +517,10 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
 	print("Saving search file output")
 	progress(0.7, desc = "Saving search output to file")
-	results_df_out.to_excel(results_df_name, index= None)
 	results_first_text = results_df_out[text_column].iloc[0]
 	print("Returning results")

 today_rev = datetime.now().strftime("%Y%m%d")
 from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
+from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb
 # Load the SpaCy model
 from spacy.cli.download import download
 	print("Saving search file output")
 	progress(0.7, desc = "Saving search output to file")
+	# Highlight found text and save to file
+	results_df_out_wb = create_highlighted_excel_wb(results_df_out, free_text_query, "search_text")
+	results_df_out_wb.save(results_df_name)
+	#results_df_out.to_excel(results_df_name, index= None)
 	results_first_text = results_df_out[text_column].iloc[0]
 	print("Returning results")

search_funcs/helper_functions.py CHANGED Viewed

@@ -9,6 +9,12 @@ import gzip
 import pickle
 import numpy as np
 # Attempt to delete content of gradio temp folder
 def get_temp_folder_path():
     username = getpass.getuser()
@@ -86,7 +92,7 @@ def read_file(filename):
     return file
-def initial_data_load(in_file, in_bm25_column):
     '''
     When file is loaded, update the column dropdown choices
     '''
@@ -107,7 +113,7 @@ def initial_data_load(in_file, in_bm25_column):
     if not data_file_names:
         out_message = "Please load in at least one csv/Excel/parquet data file."
         print(out_message)
-        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), bm25_load, out_message
     data_file_name = data_file_names[0]
@@ -179,7 +185,7 @@ def put_columns_in_join_df(in_file):
     return gr.Dropdown(choices=concat_choices), new_df, out_message
-def dummy_function(gradio_component):
     """
     A dummy function that exists just so that dropdown updates work correctly.
     """
@@ -188,3 +194,109 @@ def dummy_function(gradio_component):
 def display_info(info_component):
     gr.Info(info_component)

 import pickle
 import numpy as np
+# Openpyxl functions for output
+from openpyxl import Workbook
+from openpyxl.cell.text import InlineFont
+from openpyxl.cell.rich_text import TextBlock, CellRichText
+from openpyxl.styles import Font
 # Attempt to delete content of gradio temp folder
 def get_temp_folder_path():
     username = getpass.getuser()
     return file
+def initial_data_load(in_file):
     '''
     When file is loaded, update the column dropdown choices
     '''
     if not data_file_names:
         out_message = "Please load in at least one csv/Excel/parquet data file."
         print(out_message)
+        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), index_load, out_message
     data_file_name = data_file_names[0]
     return gr.Dropdown(choices=concat_choices), new_df, out_message
     """
     A dummy function that exists just so that dropdown updates work correctly.
     """
 def display_info(info_component):
     gr.Info(info_component)
def highlight_found_text(search_text: str, full_text: str, merge_gap: int = 10) -> "tuple[str, list[tuple[int, int]]]":
    """
    Highlight occurrences of the words of search_text within full_text.

    The query is split on whitespace and every (case-sensitive) occurrence of
    each word is located in full_text. Matches that overlap, or that start
    within merge_gap characters of the end of the previous match, are merged
    into a single span. Merged spans longer than one character are wrapped in
    a <mark> tag in the returned HTML string.

    Parameters:
    - search_text (str | list): The text to be searched for within full_text.
      If a list is passed, the second item of its last entry is used.
    - full_text (str | list): The text within which search_text occurrences
      will be highlighted. If a list is passed, the first item of its first
      entry is used. Any other type is treated as empty text.
    - merge_gap (int): Maximum distance, in characters, between the end of one
      match and the start of the next for the two to be merged into one span.

    Returns:
    - tuple: (html_text, combined_positions) where html_text is full_text with
      the matched spans wrapped in <mark> tags, and combined_positions is a
      list of (start, end) index pairs (end exclusive) of ALL merged spans,
      including the one-character spans that are not marked in html_text.

    Example:
    >>> highlight_found_text("world", "Hello, world! This is a test. Another world awaits.")
    ('Hello, <mark style="color:black;">world</mark>! This is a test. Another <mark style="color:black;">world</mark> awaits.', [(7, 12), (38, 43)])
    """
    def extract_text_from_input(text, i=0):
        # A list input is read as a list of pairs; take the first item of entry i.
        if isinstance(text, str):
            return text
        if isinstance(text, list):
            return text[i][0]
        return ""

    def extract_search_text_from_input(text):
        # A list input is read as a list of pairs; take the second item of the last entry.
        if isinstance(text, str):
            return text
        if isinstance(text, list):
            return text[-1][1]
        return ""

    full_text = extract_text_from_input(full_text)
    search_text = extract_search_text_from_input(search_text)

    # The list branches above may hand back non-string items (e.g. None).
    if not isinstance(full_text, str):
        full_text = ""
    if not isinstance(search_text, str):
        search_text = ""

    # split() without arguments never yields empty tokens. An empty token would
    # "match" at every index of full_text and cause the whole text to be highlighted.
    terms = search_text.split()

    # Map of match start -> match end (exclusive).
    found_positions = {}
    for term in terms:
        start = full_text.find(term)
        while start != -1:
            end = start + len(term)
            # Two terms can start at the same index; keep the longest match.
            if end > found_positions.get(start, start):
                found_positions[start] = end
            # Step by one character so that overlapping occurrences are found too.
            start = full_text.find(term, start + 1)

    # Combine overlapping or nearby positions
    combined_positions = []
    for start in sorted(found_positions):
        end = found_positions[start]
        if combined_positions and start <= combined_positions[-1][1] + merge_gap:
            previous_start, previous_end = combined_positions[-1]
            combined_positions[-1] = (previous_start, max(previous_end, end))
        else:
            combined_positions.append((start, end))

    # Construct the HTML output.
    # NOTE(review): full_text is inserted unescaped; escape it before rendering
    # if it can contain HTML markup.
    pos_tokens = []
    prev_end = 0
    for start, end in combined_positions:
        if end - start > 1:  # Skip single-character spans, which are not considered significant matches.
            pos_tokens.append(full_text[prev_end:start])
            pos_tokens.append('<mark style="color:black;">' + full_text[start:end] + '</mark>')
            prev_end = end
    pos_tokens.append(full_text[prev_end:])

    return "".join(pos_tokens), combined_positions
def create_rich_text_cell_from_positions(full_text, combined_positions, highlight_colour='00FF0000'):
    """
    Build an openpyxl rich-text cell value in which the given spans of
    full_text are coloured and the rest of the text is left unformatted.

    Parameters:
    - full_text (str): The complete cell text.
    - combined_positions (list): (start, end) index pairs (end exclusive) into
      full_text, in ascending order, marking the spans to colour.
    - highlight_colour (str): Colour value passed to InlineFont for the
      highlighted spans. Defaults to red.

    Returns:
    - CellRichText: Alternating plain-text runs and coloured TextBlock runs
      that together reproduce full_text.
    """
    highlight_font = InlineFont(color=highlight_colour)
    rich_text_cell = CellRichText()

    prev_end = 0
    for start, end in combined_positions:
        if end - start > 1:  # Skip single-character spans, which are not considered significant matches.
            # Only add the plain text before the match when there is some, so
            # that no empty runs are written to the cell.
            if start > prev_end:
                rich_text_cell.append(full_text[prev_end:start])
            rich_text_cell.append(TextBlock(highlight_font, full_text[start:end]))
            prev_end = end

    # Add the remaining plain text. When nothing was added above, always append
    # (even an empty string) so the result is never an empty rich-text value.
    if prev_end < len(full_text) or not rich_text_cell:
        rich_text_cell.append(full_text[prev_end:])

    return rich_text_cell
def create_highlighted_excel_wb(df, search_text, column_to_highlight):
    """
    Copy a DataFrame into a new openpyxl workbook, colouring the parts of one
    column's text that match the search terms.

    Parameters:
    - df (pd.DataFrame): The data to write. Column names become a bold header row.
    - search_text (str): The search query; passed to highlight_found_text.
    - column_to_highlight (str): Name of the column whose text cells get their
      matches highlighted. If no column has this name, nothing is highlighted.

    Returns:
    - Workbook: The populated workbook. It is not saved; the caller decides
      where to write it.
    """
    # Create a new Excel workbook
    wb = Workbook()
    sheet = wb.active

    # Insert headers into the worksheet, make bold
    column_names = df.columns.tolist()
    sheet.append(column_names)
    for cell in sheet[1]:
        cell.font = Font(bold=True)

    # Write the data rows (row 1 holds the header), highlighting matches in the chosen column
    for r_idx, row in enumerate(df.itertuples(index=False), start=2):
        for c_idx, cell_value in enumerate(row, start=1):
            out_value = cell_value

            if isinstance(cell_value, float) and cell_value != cell_value:
                # NaN marks a missing value; leave the cell empty, as the
                # DataFrame.to_excel call this function replaces would.
                out_value = None
            elif column_names[c_idx - 1] == column_to_highlight and isinstance(cell_value, str):
                # Only text can be searched and sliced; other value types are
                # written unchanged instead of raising a TypeError.
                _, combined_positions = highlight_found_text(search_text, cell_value)
                if combined_positions:
                    out_value = create_rich_text_cell_from_positions(cell_value, combined_positions)

            sheet.cell(row=r_idx, column=c_idx, value=out_value)

    return wb