Spaces:
Sleeping
Sleeping
Commit
·
651ef78
1
Parent(s):
30b5dc1
Added additional password auth for AWS-based files. Changed 'Clean' default to no
Browse files
- .gitignore +3 -1
- app.py +3 -2
- search_funcs/aws_functions.py +36 -33
.gitignore
CHANGED
|
@@ -17,6 +17,7 @@
|
|
| 17 |
*.pkl.gz
|
| 18 |
*.pem
|
| 19 |
*.json.out
|
|
|
|
| 20 |
docs/*
|
| 21 |
build/*
|
| 22 |
dist/*
|
|
@@ -25,4 +26,5 @@ db/*
|
|
| 25 |
experiments/*
|
| 26 |
model/*
|
| 27 |
build_deps/*
|
| 28 |
-
build_deps_old/*
|
|
|
|
|
|
| 17 |
*.pkl.gz
|
| 18 |
*.pem
|
| 19 |
*.json.out
|
| 20 |
+
*.env
|
| 21 |
docs/*
|
| 22 |
build/*
|
| 23 |
dist/*
|
|
|
|
| 26 |
experiments/*
|
| 27 |
model/*
|
| 28 |
build_deps/*
|
| 29 |
+
build_deps_old/*
|
| 30 |
+
AWS errors 19-03.txt
|
app.py
CHANGED
|
@@ -157,6 +157,7 @@ depends on factors such as the type of documents or queries. Information taken f
|
|
| 157 |
in_join_column = gr.Dropdown(label="Column to join in new data frame")
|
| 158 |
search_df_join_column = gr.Dropdown(label="Column to join in search data frame")
|
| 159 |
with gr.Accordion(label = "AWS data access", open = False):
|
|
|
|
| 160 |
with gr.Row():
|
| 161 |
in_aws_keyword_file = gr.Dropdown(label="Choose keyword file to load from AWS (only valid for API Gateway app)", choices=["None", "Bioasq - Biomedical example data - keyword search"])
|
| 162 |
load_aws_keyword_data_button = gr.Button(value="Load keyword data from AWS", variant="secondary")
|
|
@@ -176,8 +177,8 @@ depends on factors such as the type of documents or queries. Information taken f
|
|
| 176 |
in_no_search_results_button.click(display_info, inputs=in_no_search_info)
|
| 177 |
|
| 178 |
### Loading AWS data ###
|
| 179 |
-
load_aws_keyword_data_button.click(fn=load_data_from_aws, inputs=[in_aws_keyword_file], outputs=[in_bm25_file, out_aws_data_message])
|
| 180 |
-
load_aws_semantic_data_button.click(fn=load_data_from_aws, inputs=[in_aws_semantic_file], outputs=[in_semantic_file, out_aws_data_message])
|
| 181 |
|
| 182 |
|
| 183 |
### BM25 SEARCH ###
|
|
|
|
| 157 |
in_join_column = gr.Dropdown(label="Column to join in new data frame")
|
| 158 |
search_df_join_column = gr.Dropdown(label="Column to join in search data frame")
|
| 159 |
with gr.Accordion(label = "AWS data access", open = False):
|
| 160 |
+
aws_password_box = gr.Textbox(label="Password for AWS data access (ask Data team if you don't have this)")
|
| 161 |
with gr.Row():
|
| 162 |
in_aws_keyword_file = gr.Dropdown(label="Choose keyword file to load from AWS (only valid for API Gateway app)", choices=["None", "Bioasq - Biomedical example data - keyword search"])
|
| 163 |
load_aws_keyword_data_button = gr.Button(value="Load keyword data from AWS", variant="secondary")
|
|
|
|
| 177 |
in_no_search_results_button.click(display_info, inputs=in_no_search_info)
|
| 178 |
|
| 179 |
### Loading AWS data ###
|
| 180 |
+
load_aws_keyword_data_button.click(fn=load_data_from_aws, inputs=[in_aws_keyword_file, aws_password_box], outputs=[in_bm25_file, out_aws_data_message])
|
| 181 |
+
load_aws_semantic_data_button.click(fn=load_data_from_aws, inputs=[in_aws_semantic_file, aws_password_box], outputs=[in_semantic_file, out_aws_data_message])
|
| 182 |
|
| 183 |
|
| 184 |
### BM25 SEARCH ###
|
search_funcs/aws_functions.py
CHANGED
|
@@ -106,60 +106,63 @@ def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
|
|
| 106 |
|
| 107 |
|
| 108 |
|
| 109 |
-
def load_data_from_aws(in_aws_keyword_file, bucket_name=bucket_name):
|
| 110 |
|
| 111 |
temp_dir = tempfile.mkdtemp()
|
| 112 |
local_keyword_stub = temp_dir + '/keyword/'
|
| 113 |
local_semantic_stub = temp_dir + '/semantic/'
|
| 114 |
|
| 115 |
files = []
|
|
|
|
|
|
|
| 116 |
|
| 117 |
-
|
| 118 |
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
s3_folder_stub = s3_folder_stub + 'keyword/'
|
| 123 |
-
local_folder_path = local_keyword_stub
|
| 124 |
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
|
| 134 |
-
|
| 135 |
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
|
| 148 |
-
|
| 149 |
|
| 150 |
-
|
| 151 |
-
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
| 155 |
|
| 156 |
-
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
|
|
|
|
|
|
|
|
|
|
| 161 |
else:
|
| 162 |
-
out_message = "
|
| 163 |
print(out_message)
|
| 164 |
|
| 165 |
return files, out_message
|
|
|
|
| 106 |
|
| 107 |
|
| 108 |
|
| 109 |
+
def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):
    """Download a password-protected example dataset from S3 to a temp folder.

    Args:
        in_aws_keyword_file: Dataset label chosen in the UI. It must contain
            "Bioasq - Biomedical example data" plus either "keyword" or
            "semantic" to select which set of files to fetch.
        aws_password: Password supplied by the user. It is checked against
            the ``BIOASQ_PASSWORD`` environment variable.
        bucket_name: Name of the S3 bucket to download from.

    Returns:
        A ``(files, out_message)`` tuple. ``files`` is the list of local file
        paths that were found in the download folder (empty when nothing was
        loaded) and ``out_message`` is a status string for display.
    """
    # Imported locally so this function does not depend on a file-level import.
    import hmac

    files = []

    if not aws_password:
        out_message = "No password provided. Please ask the data team for access if you need this."
        print(out_message)
        return files, out_message

    # The dropdown may deliver None when nothing is selected.
    dataset_choice = in_aws_keyword_file or ""

    # A missing environment variable must deny access rather than raise
    # KeyError; compare_digest avoids leaking the password through timing.
    expected_password = os.environ.get('BIOASQ_PASSWORD', '')
    password_ok = bool(expected_password) and hmac.compare_digest(
        str(aws_password).encode('utf-8'),
        expected_password.encode('utf-8'),
    )

    wants_keyword = 'keyword' in dataset_choice
    wants_semantic = 'semantic' in dataset_choice

    if (
        not password_ok
        or "Bioasq - Biomedical example data" not in dataset_choice
        or not (wants_keyword or wants_semantic)
    ):
        # Also covers labels naming neither data type, which previously left
        # the local folder path undefined.
        out_message = "Data not loaded from AWS"
        print(out_message)
        return files, out_message

    # Only create the temp directory once the request has been accepted, so
    # rejected requests do not leave empty directories behind.
    temp_dir = tempfile.mkdtemp()

    s3_folder_stub = 'example_data/bioasq/latest/'

    if wants_keyword:
        s3_folder_stub = s3_folder_stub + 'keyword/'
        local_folder_path = temp_dir + '/keyword/'
    else:
        s3_folder_stub = s3_folder_stub + 'semantic/'
        local_folder_path = temp_dir + '/semantic/'

    # Check if folder exists
    if not os.path.exists(local_folder_path):
        print(f"Folder {local_folder_path} does not exist! Making folder.")
        os.makedirs(local_folder_path, exist_ok=True)

    # Check if folder is empty
    if len(os.listdir(local_folder_path)) == 0:
        print(f"Folder {local_folder_path} is empty")

        if wants_keyword:
            # Download keyword folder
            download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames='*')
        else:
            # Download semantic folder (only the embedding and prepared-docs files)
            download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames=['mini-bioasq-0000_cleaned_bge_embedding_compress.npz', 'mini-bioasq-0000_cleaned_prepared_docs.pkl.gz'])

        print("AWS data downloaded")

    else:
        print(f"Folder {local_folder_path} is not empty")

    files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]

    out_message = "Data successfully loaded from AWS"
    print(out_message)

    return files, out_message
|