Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Dec 29, 2024

Commit

c9e23cb

1 Parent(s): a03496e

Adapted text join options to review file to be more resilient to changes in image size. Added possibility of using client secret with AWS login

Browse files

Files changed (4) hide show

app.py +2 -2
tools/auth.py +39 -9
tools/file_conversion.py +52 -2
tools/redaction_review.py +1 -1

app.py CHANGED Viewed

@@ -458,9 +458,9 @@ with app:
     then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 # Get some environment variables and Launch the Gradio app
-COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
 RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
 print(f'The value of RUN_DIRECT_MODE is {RUN_DIRECT_MODE}')

     then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 # Get some environment variables and Launch the Gradio app
+COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '1')
 print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
+1
 RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
 print(f'The value of RUN_DIRECT_MODE is {RUN_DIRECT_MODE}')

tools/auth.py CHANGED Viewed

@@ -1,15 +1,31 @@
 import boto3
 import gradio as gr
 from tools.helper_functions import get_or_create_env_var
-client_id = get_or_create_env_var('AWS_CLIENT_ID', '') # This client id is borrowed from async gradio app client
 print(f'The value of AWS_CLIENT_ID is {client_id}')
-user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
 print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
-def authenticate_user(username:str, password:str, user_pool_id:str=user_pool_id, client_id:str=client_id):
     """Authenticates a user against an AWS Cognito user pool.
     Args:
@@ -17,6 +33,7 @@ def authenticate_user(username:str, password:str, user_pool_id:str=user_pool_id,
         client_id (str): The ID of the Cognito user pool client.
         username (str): The username of the user.
         password (str): The password of the user.
     Returns:
         bool: True if the user is authenticated, False otherwise.
@@ -24,15 +41,28 @@ def authenticate_user(username:str, password:str, user_pool_id:str=user_pool_id,
     client = boto3.client('cognito-idp')  # Cognito Identity Provider client
     try:
         response = client.initiate_auth(
-            AuthFlow='USER_PASSWORD_AUTH',
-            AuthParameters={
-                'USERNAME': username,
-                'PASSWORD': password,
-            },
-            ClientId=client_id
         )
         # If successful, you'll receive an AuthenticationResult in the response

 import boto3
 import gradio as gr
+import hmac
+import hashlib
+import base64
 from tools.helper_functions import get_or_create_env_var
# Cognito settings are read from the environment only. Real IDs or secrets must
# never be committed as fallback defaults, so every default is the empty string.
# NOTE(review): the credential values previously committed here should be
# treated as exposed and rotated.
client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
print(f'The value of AWS_CLIENT_ID is {client_id}')

client_secret = get_or_create_env_var('AWS_CLIENT_SECRET', '')
# Report only whether a secret is configured; the value itself must not reach logs.
print(f"AWS_CLIENT_SECRET is {'set' if client_secret else 'not set'}")

user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
def calculate_secret_hash(client_id: str, client_secret: str, username: str) -> str:
    """Compute the ``SECRET_HASH`` value sent with a Cognito auth request.

    The result is the Base64-encoded HMAC-SHA256 digest of the username
    followed by the client ID, keyed with the client secret.

    Args:
        client_id (str): The ID of the Cognito user pool client.
        client_secret (str): The client secret of the app client (HMAC key).
        username (str): The username of the user.

    Returns:
        str: The Base64 text of the digest.
    """
    # Coerce each part to text *before* joining, so non-str values are
    # concatenated rather than added arithmetically or raising TypeError.
    message = f"{username}{client_id}"
    digest = hmac.new(
        str(client_secret).encode('utf-8'),
        msg=message.encode('utf-8'),
        digestmod=hashlib.sha256,
    ).digest()
    return base64.b64encode(digest).decode('ascii')
+def authenticate_user(username:str, password:str, user_pool_id:str=user_pool_id, client_id:str=client_id, client_secret:str=client_secret):
     """Authenticates a user against an AWS Cognito user pool.
     Args:
         client_id (str): The ID of the Cognito user pool client.
         username (str): The username of the user.
         password (str): The password of the user.
+        client_secret (str): The client secret of the app client
     Returns:
         bool: True if the user is authenticated, False otherwise.
     client = boto3.client('cognito-idp')  # Cognito Identity Provider client
+    # Compute the secret hash
+    secret_hash = calculate_secret_hash(client_id, client_secret, username)
     try:
+        # response = client.initiate_auth(
+        #     AuthFlow='USER_PASSWORD_AUTH',
+        #     AuthParameters={
+        #         'USERNAME': username,
+        #         'PASSWORD': password,
+        #     },
+        #     ClientId=client_id
+        # )
         response = client.initiate_auth(
+        AuthFlow='USER_PASSWORD_AUTH',
+        AuthParameters={
+            'USERNAME': username,
+            'PASSWORD': password,
+            'SECRET_HASH': secret_hash
+        },
+        ClientId=client_id
         )
         # If successful, you'll receive an AuthenticationResult in the response

tools/file_conversion.py CHANGED Viewed

@@ -650,6 +650,53 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
     return out_message, out_file_paths
 def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFrame) -> pd.DataFrame:
     # Flatten the data
@@ -691,8 +738,11 @@ def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFram
         text_join_data['page'] = text_join_data['page'].astype(str)
         df['page'] = df['page'].astype(str)
         text_join_data = text_join_data[['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page', 'text']]
-        text_join_data[['xmin', 'ymin', 'xmax', 'ymax']] = text_join_data[['xmin', 'ymin', 'xmax', 'ymax']].astype(float).round(0)
-        df[['xmin1', 'ymin1', 'xmax1', 'ymax1']] = df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float).round(0)
         df = df.merge(text_join_data, left_on = ['xmin1', 'ymin1', 'xmax1', 'ymax1', 'label', 'page'], right_on = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'], how = "left", suffixes=("", "_y"))

     return out_message, out_file_paths
+# Example DataFrames
+# df1 = pd.DataFrame({
+#     'xmin': [10, 20, 30],
+#     'xmax': [15, 25, 35],
+#     'ymin': [40, 50, 60],
+#     'ymax': [45, 55, 65],
+#     'info1': ['A', 'B', 'C']
+# })
+# df2 = pd.DataFrame({
+#     'xmin': [12, 18, 32],
+#     'xmax': [14, 24, 34],
+#     'ymin': [42, 48, 62],
+#     'ymax': [44, 54, 66],
+#     'info2': ['X', 'Y', 'Z']
+# })
def join_values_within_threshold(df1, df2, threshold=5):
    """Left-join ``df2`` onto ``df1`` where all four box coordinates are close.

    A row of ``df2`` matches a row of ``df1`` when ``xmin``, ``xmax``, ``ymin``
    and ``ymax`` each differ by at most ``threshold``. Only the first matching
    ``df2`` row (in ``df2`` order) is kept for each ``df1`` row. Neither input
    frame is modified.

    Args:
        df1 (pd.DataFrame): Left frame; must contain the four coordinate columns.
        df2 (pd.DataFrame): Right frame; must contain the four coordinate columns.
        threshold (float): Maximum absolute difference allowed per coordinate.

    Returns:
        pd.DataFrame: One row per ``df1`` row, holding all ``df1`` columns plus
        the matched ``df2`` columns. Column names of ``df2`` that clash with
        ``df1`` (including the coordinates) get a ``_y`` suffix. Rows without
        a match hold missing values in the ``df2`` columns.
    """
    coord_cols = ['xmin', 'xmax', 'ymin', 'ymax']
    row_id = '_df1_row'

    # Tag every df1 row on a fresh frame so the caller's data is left untouched
    # and duplicate coordinates in df1 can still be told apart.
    left = df1.reset_index(drop=True).assign(**{row_id: range(len(df1))})

    # Pair every df1 row with every df2 row (cross join).
    pairs = pd.merge(left[coord_cols + [row_id]], df2, how='cross', suffixes=('', '_y'))

    # A pair qualifies only if all four coordinates are within the threshold.
    within = (
        ((pairs['xmin'] - pairs['xmin_y']).abs() <= threshold)
        & ((pairs['xmax'] - pairs['xmax_y']).abs() <= threshold)
        & ((pairs['ymin'] - pairs['ymin_y']).abs() <= threshold)
        & ((pairs['ymax'] - pairs['ymax_y']).abs() <= threshold)
    )

    # Keep the first match per df1 row; drop df1's own coordinates so they are
    # not duplicated when merging back.
    matches = pairs[within].drop_duplicates(subset=[row_id]).drop(columns=coord_cols)

    final_df = left.merge(matches, on=row_id, how='left', suffixes=('', '_y'))
    return final_df.drop(columns=[row_id])
 def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFrame) -> pd.DataFrame:
     # Flatten the data
         text_join_data['page'] = text_join_data['page'].astype(str)
         df['page'] = df['page'].astype(str)
         text_join_data = text_join_data[['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page', 'text']]
+        # Round to the closest number divisible by 5
+        text_join_data[['xmin', 'ymin', 'xmax', 'ymax']] = (text_join_data[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
+        text_join_data = text_join_data.drop_duplicates(['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'])
+        df[['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
         df = df.merge(text_join_data, left_on = ['xmin1', 'ymin1', 'xmax1', 'ymax1', 'label', 'page'], right_on = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'], how = "left", suffixes=("", "_y"))

tools/redaction_review.py CHANGED Viewed

@@ -41,7 +41,7 @@ def increase_page(number:int, image_annotator_object:AnnotatedImageData):
 def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool=True):
     if decrease == False:
-        if current_zoom_level >= 50:
             current_zoom_level -= 10
     else:
         if current_zoom_level < 100:

 def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool=True):
     if decrease == False:
+        if current_zoom_level >= 70:
             current_zoom_level -= 10
     else:
         if current_zoom_level < 100: