Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Nov 25, 2024

Commit

bf7bb79

1 Parent(s): 68a91f4

Modified the Dockerfile so that, hopefully, Lambda overrides are no longer needed. Also looking into custom headers from CloudFront to try to get them working.

Browse files

Files changed (4) hide show

Dockerfile +12 -6
app.py +3 -4
tools/auth.py +6 -2
tools/helper_functions.py +23 -19

Dockerfile CHANGED Viewed

@@ -51,12 +51,15 @@ RUN mkdir -p /home/user/app/output \
 # Copy installed packages from builder stage
 COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
-# Use a conditional entrypoint based on the APP_MODE argument
-RUN if [ "$APP_MODE" = "lambda" ]; then \
-        echo '#!/bin/sh\nexec python -m awslambdaric' > /entrypoint.sh; \
-    else \
-        echo '#!/bin/sh\nexec python app.py' > /entrypoint.sh; \
-    fi && chmod +x /entrypoint.sh
 # Switch to the "user" user
 USER user
@@ -71,6 +74,7 @@ ENV HOME=/home/user \
     GRADIO_NUM_PORTS=1 \
     GRADIO_SERVER_NAME=0.0.0.0 \
     GRADIO_SERVER_PORT=7860 \
     GRADIO_THEME=huggingface \
     TLDEXTRACT_CACHE=$HOME/app/tld/.tld_set_snapshot \
     SYSTEM=spaces
@@ -81,6 +85,8 @@ WORKDIR $HOME/app
 # Copy the app code to the container
 COPY --chown=user . $HOME/app
 ENTRYPOINT [ "/entrypoint.sh" ]
 # Default command for Lambda mode

 # Copy installed packages from builder stage
 COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
+# Conditional entrypoint based on the APP_MODE argument (deprecated: entrypoint.sh is now a pre-written file that is copied into the image below)
+# RUN if [ "$APP_MODE" = "lambda" ]; then \
+#         echo '#!/bin/sh\nexec python -m awslambdaric' > /entrypoint.sh; \
+#     else \
+#         echo '#!/bin/sh\nexec python app.py' > /entrypoint.sh; \
+#     fi && chmod +x /entrypoint.sh
+# Entrypoint helps to switch between Gradio and Lambda mode
+COPY entrypoint.sh /entrypoint.sh
 # Switch to the "user" user
 USER user
     GRADIO_NUM_PORTS=1 \
     GRADIO_SERVER_NAME=0.0.0.0 \
     GRADIO_SERVER_PORT=7860 \
+    GRADIO_ANALYTICS_ENABLED=False \
     GRADIO_THEME=huggingface \
     TLDEXTRACT_CACHE=$HOME/app/tld/.tld_set_snapshot \
     SYSTEM=spaces
 # Copy the app code to the container
 COPY --chown=user . $HOME/app
+RUN chmod +x /entrypoint.sh
 ENTRYPOINT [ "/entrypoint.sh" ]
 # Default command for Lambda mode

app.py CHANGED Viewed

@@ -19,7 +19,6 @@ from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities
 from tools.custom_csvlogger import CSVLogger_custom
 today_rev = datetime.now().strftime("%Y%m%d")
 add_folder_to_path("tesseract/")
@@ -286,7 +285,7 @@ with app:
     then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
     then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
                     outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
-                    then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
     # If the app has completed a batch of pages, it will run this until the end of all pages in the document
     current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
@@ -397,7 +396,7 @@ with app:
     then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 # Launch the Gradio app
-COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
 RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
@@ -411,7 +410,7 @@ if __name__ == "__main__":
     if RUN_DIRECT_MODE == "0":
         if os.environ['COGNITO_AUTH'] == "1":
-            app.queue(max_size=max_queue_size).launch(show_error=True, auth=authenticate_user, max_file_size=max_file_size)
         else:
             app.queue(max_size=max_queue_size).launch(show_error=True, inbrowser=True, max_file_size=max_file_size)

 from tools.load_spacy_model_custom_recognisers import custom_entities
 from tools.custom_csvlogger import CSVLogger_custom
 today_rev = datetime.now().strftime("%Y%m%d")
 add_folder_to_path("tesseract/")
     then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
     then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
                     outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
+                    then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
     # If the app has completed a batch of pages, it will run this until the end of all pages in the document
     current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
     then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 # Launch the Gradio app
+COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '1')
 print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
 RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
     if RUN_DIRECT_MODE == "0":
         if os.environ['COGNITO_AUTH'] == "1":
+            app.queue(max_size=max_queue_size).launch(show_error=True, auth=authenticate_user, max_file_size=max_file_size, show_api=False)
         else:
             app.queue(max_size=max_queue_size).launch(show_error=True, inbrowser=True, max_file_size=max_file_size)

tools/auth.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import boto3
 from tools.helper_functions import get_or_create_env_var
 client_id = get_or_create_env_var('AWS_CLIENT_ID', '') # This client id is borrowed from async gradio app client
@@ -8,7 +9,9 @@ print(f'The value of AWS_CLIENT_ID is {client_id}')
 user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
 print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
-def authenticate_user(username, password, user_pool_id=user_pool_id, client_id=client_id):
     """Authenticates a user against an AWS Cognito user pool.
     Args:
@@ -24,6 +27,7 @@ def authenticate_user(username, password, user_pool_id=user_pool_id, client_id=c
     client = boto3.client('cognito-idp')  # Cognito Identity Provider client
     try:
         response = client.initiate_auth(
             AuthFlow='USER_PASSWORD_AUTH',
             AuthParameters={
@@ -45,4 +49,4 @@ def authenticate_user(username, password, user_pool_id=user_pool_id, client_id=c
         return False
     except Exception as e:
         print(f"An error occurred: {e}")
-        return False

 import boto3
+import gradio as gr
 from tools.helper_functions import get_or_create_env_var
 client_id = get_or_create_env_var('AWS_CLIENT_ID', '') # This client id is borrowed from async gradio app client
 user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
 print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
+def authenticate_user(username:str, password:str, user_pool_id:str=user_pool_id, client_id:str=client_id):
     """Authenticates a user against an AWS Cognito user pool.
     Args:
     client = boto3.client('cognito-idp')  # Cognito Identity Provider client
     try:
         response = client.initiate_auth(
             AuthFlow='USER_PASSWORD_AUTH',
             AuthParameters={
         return False
     except Exception as e:
         print(f"An error occurred: {e}")
+        return False

tools/helper_functions.py CHANGED Viewed

@@ -211,7 +211,15 @@ def wipe_logs(feedback_logs_loc, usage_logs_loc):
         os.remove(usage_logs_loc)
     except Exception as e:
         print("Could not remove usage logs file", e)
 async def get_connection_params(request: gr.Request):
     base_folder = ""
@@ -223,29 +231,25 @@ async def get_connection_params(request: gr.Request):
     #if 'context' in request_data:
     #     print("Request context dictionary:", request_data['context'])
-    # print("Request headers dictionary:", request.headers)
-    # print("All host elements", request.client)
-    # print("IP address:", request.client.host)
-    # print("Query parameters:", dict(request.query_params))
     # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
     #print("Request dictionary to object:", request.request.body())
     print("Session hash:", request.session_hash)
-    # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER
-    CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')
-    #print(f'The value of CUSTOM_CLOUDFRONT_HEADER is {CUSTOM_CLOUDFRONT_HEADER_var}')
-    # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER_VALUE
-    CUSTOM_CLOUDFRONT_HEADER_VALUE_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER_VALUE', '')
-    #print(f'The value of CUSTOM_CLOUDFRONT_HEADER_VALUE_var is {CUSTOM_CLOUDFRONT_HEADER_VALUE_var}')
-    if CUSTOM_CLOUDFRONT_HEADER_var and CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
-        if CUSTOM_CLOUDFRONT_HEADER_var in request.headers:
-            supplied_cloudfront_custom_value = request.headers[CUSTOM_CLOUDFRONT_HEADER_var]
-            if supplied_cloudfront_custom_value == CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
-                print("Custom Cloudfront header found:", supplied_cloudfront_custom_value)
             else:
-                raise(ValueError, "Custom Cloudfront header value does not match expected value.")
     # Get output save folder from 1 - username passed in from direct Cognito login, 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.

         os.remove(usage_logs_loc)
     except Exception as e:
         print("Could not remove usage logs file", e)
+# Retrieving or setting CUSTOM_HEADER
+CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', 'custom_header')
+print(f'CUSTOM_HEADER found')
+# Retrieving or setting CUSTOM_HEADER_VALUE
+CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', 'custom_header_value')
+print(f'CUSTOM_HEADER_VALUE found')
 async def get_connection_params(request: gr.Request):
     base_folder = ""
     #if 'context' in request_data:
     #     print("Request context dictionary:", request_data['context'])
+    print("Request headers dictionary:", request.headers)
+    print("All host elements", request.client)
+    print("IP address:", request.client.host)
+    print("Query parameters:", dict(request.query_params))
     # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
     #print("Request dictionary to object:", request.request.body())
     print("Session hash:", request.session_hash)
+    if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:
+            if CUSTOM_HEADER in request.headers:
+                supplied_custom_header_value = request.headers[CUSTOM_HEADER]
+                if supplied_custom_header_value == CUSTOM_HEADER_VALUE:
+                    print("Custom header supplied and matches CUSTOM_HEADER_VALUE")
+                else:
+                    print("Custom header value does not match expected value.")
+                    raise ValueError("Custom header value does not match expected value.")
             else:
+                print("Custom header value not found.")
+                raise ValueError("Custom header value not found.")
     # Get output save folder from 1 - username passed in from direct Cognito login, 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.