File size: 6,748 Bytes
6319afc 0ea8b9e 6319afc 0ea8b9e 6319afc 0ea8b9e 6319afc 0ea8b9e 6319afc 0ea8b9e 6319afc 0ea8b9e 6319afc 0ea8b9e 6319afc 25c9832 0ea8b9e 6319afc 0ea8b9e 6319afc 0ea8b9e 6319afc 0ea8b9e 6319afc 0ea8b9e 6319afc 0ea8b9e 6319afc 0ea8b9e 6319afc 25c9832 6319afc 25c9832 6319afc 25c9832 0ea8b9e 25c9832 0ea8b9e 25c9832 0ea8b9e 25c9832 0ea8b9e 25c9832 0ea8b9e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
import os
import tempfile
import socket
from datetime import datetime
from dotenv import load_dotenv
from tldextract import TLDExtract
# Date stamp (YYYYMMDD) taken once at import time; used below to build the
# default log folder paths.
today_rev = f"{datetime.now():%Y%m%d}"

# Name of the machine running the app; also part of the default log folder paths.
host_name = socket.gethostname()
# Set or retrieve configuration variables for the redaction app
def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False) -> str:
    '''
    Get an environment variable, setting it to a default value if it doesn't exist.

    Args:
        var_name: Name of the environment variable to look up.
        default_value: Value written to os.environ (and returned) when the
            variable is not already set.
        print_val: When truthy, print the variable name and its resolved value.

    Returns:
        The existing value of the variable, or default_value if it was unset.
        A variable that exists but is empty counts as set and is returned as ''.
    '''
    # setdefault returns the existing value, or stores and returns the default,
    # so the process environment always contains the variable afterwards.
    value = os.environ.setdefault(var_name, default_value)

    if print_val:
        print(f'The value of {var_name} is {value}')

    return value
# Optional app-level env file, e.g. '/env/app_config.env'. When the path
# exists, its variables are loaded into the environment before the settings
# further down are read.
APP_CONFIG_PATH = get_or_create_env_var("APP_CONFIG_PATH", "")

if os.path.exists(APP_CONFIG_PATH):
    print(f"Loading APP variables from config file {APP_CONFIG_PATH}")
    load_dotenv(APP_CONFIG_PATH)
###
# AWS CONFIG
###

# Optional AWS env file, e.g. '/env/aws_config.env'. When the path exists,
# AWS keys and related variables are loaded from it before being read below.
AWS_CONFIG_PATH = get_or_create_env_var("AWS_CONFIG_PATH", "")

if os.path.exists(AWS_CONFIG_PATH):
    print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
    load_dotenv(AWS_CONFIG_PATH)
# All values are kept as the raw strings read from the environment.
RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
AWS_REGION = get_or_create_env_var("AWS_REGION", "eu-west-2")

AWS_CLIENT_ID = get_or_create_env_var("AWS_CLIENT_ID", "")
AWS_CLIENT_SECRET = get_or_create_env_var("AWS_CLIENT_SECRET", "")
AWS_USER_POOL_ID = get_or_create_env_var("AWS_USER_POOL_ID", "")

# Only report that the credentials are present; never print their values.
AWS_ACCESS_KEY = get_or_create_env_var("AWS_ACCESS_KEY", "")
if AWS_ACCESS_KEY:
    print("AWS_ACCESS_KEY found in environment variables")

AWS_SECRET_KEY = get_or_create_env_var("AWS_SECRET_KEY", "")
if AWS_SECRET_KEY:
    print("AWS_SECRET_KEY found in environment variables")

DOCUMENT_REDACTION_BUCKET = get_or_create_env_var("DOCUMENT_REDACTION_BUCKET", "")
# Custom headers, e.g. if routing traffic through Cloudfront.
# Only the presence of each setting is printed, not its value.
CUSTOM_HEADER = get_or_create_env_var("CUSTOM_HEADER", "")
if CUSTOM_HEADER:
    print("CUSTOM_HEADER found")

CUSTOM_HEADER_VALUE = get_or_create_env_var("CUSTOM_HEADER_VALUE", "")
if CUSTOM_HEADER_VALUE:
    print("CUSTOM_HEADER_VALUE found")
###
# Images config
###

# These are stored as strings exactly as read from the environment.
IMAGES_DPI = get_or_create_env_var("IMAGES_DPI", "300.0")
LOAD_TRUNCATED_IMAGES = get_or_create_env_var("LOAD_TRUNCATED_IMAGES", "True")
# Changed to None if blank in file_conversion.py
MAX_IMAGE_PIXELS = get_or_create_env_var("MAX_IMAGE_PIXELS", "")
###
# File I/O config
###

# i.e. do you want your input and output folders saved within a subfolder
# based on session hash value within output/input folders
SESSION_OUTPUT_FOLDER = get_or_create_env_var("SESSION_OUTPUT_FOLDER", "True")

# Note the environment variable names carry a GRADIO_ prefix that the
# module-level constants do not.
OUTPUT_FOLDER = get_or_create_env_var("GRADIO_OUTPUT_FOLDER", "output/")  # 'output/'
INPUT_FOLDER = get_or_create_env_var("GRADIO_INPUT_FOLDER", "input/")  # 'input/'
# Allow for files to be saved in a temporary folder for increased security in some instances
if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
    # Create a temporary directory.
    # The TemporaryDirectory object is kept referenced at module level rather
    # than used as a context manager: leaving a `with` block deletes the
    # directory immediately, which would leave the folder settings pointing at
    # a path that no longer exists. Holding the object keeps the directory
    # alive for the life of the process; it is removed when the object is
    # finalised (e.g. at interpreter exit).
    _TEMP_DIR_HANDLE = tempfile.TemporaryDirectory()
    temp_dir = _TEMP_DIR_HANDLE.name
    print(f'Temporary directory created at: {temp_dir}')

    # Input and output share the same temporary directory when both are "TEMP".
    if OUTPUT_FOLDER == "TEMP":
        OUTPUT_FOLDER = temp_dir + "/"
    if INPUT_FOLDER == "TEMP":
        INPUT_FOLDER = temp_dir + "/"
# Default log locations are namespaced by start-up date and host name:
# '<kind>/<YYYYMMDD>/<hostname>/'.
# NOTE(review): the USAGE_LOGS_FOLDER default starts with 'logs/' while the
# ACCESS_LOGS_FOLDER default starts with 'usage/' - these look swapped. Confirm
# before changing, since it moves where logs are written.
FEEDBACK_LOGS_FOLDER = get_or_create_env_var("FEEDBACK_LOGS_FOLDER", f"feedback/{today_rev}/{host_name}/")
USAGE_LOGS_FOLDER = get_or_create_env_var("USAGE_LOGS_FOLDER", f"logs/{today_rev}/{host_name}/")
ACCESS_LOGS_FOLDER = get_or_create_env_var("ACCESS_LOGS_FOLDER", f"usage/{today_rev}/{host_name}/")

DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var("DISPLAY_FILE_NAMES_IN_LOGS", "False")
###
# REDACTION CONFIG
###

# Default locations of the Tesseract and Poppler folders.
TESSERACT_FOLDER = get_or_create_env_var("TESSERACT_FOLDER", "tesseract/")
POPPLER_FOLDER = get_or_create_env_var("POPPLER_FOLDER", "poppler/poppler-24.02.0/Library/bin/")

# This feature not currently implemented
SHOW_BULK_TEXTRACT_CALL_OPTIONS = get_or_create_env_var("SHOW_BULK_TEXTRACT_CALL_OPTIONS", "False")

# Number of pages to loop through before breaking the function and restarting
# from the last finished page (not currently activated).
PAGE_BREAK_VALUE = get_or_create_env_var("PAGE_BREAK_VALUE", "99999")

MAX_TIME_VALUE = get_or_create_env_var("MAX_TIME_VALUE", "999999")

CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")

# Currently only English is supported by the app
REDACTION_LANGUAGE = get_or_create_env_var("REDACTION_LANGUAGE", "en")
###
# APP RUN CONFIG
###

# Cache location passed to tldextract; a single module-level extractor is
# built here from that setting.
TLDEXTRACT_CACHE = get_or_create_env_var("TLDEXTRACT_CACHE", "tld/.tld_set_snapshot")
extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
# Get some environment variables and Launch the Gradio app
COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
RUN_DIRECT_MODE = get_or_create_env_var("RUN_DIRECT_MODE", "0")

# Queue size and server port are converted to int here; the other settings in
# this section (including DEFAULT_CONCURRENCY_LIMIT) stay as strings.
MAX_QUEUE_SIZE = int(get_or_create_env_var("MAX_QUEUE_SIZE", "5"))
MAX_FILE_SIZE = get_or_create_env_var("MAX_FILE_SIZE", "250mb")
GRADIO_SERVER_PORT = int(get_or_create_env_var("GRADIO_SERVER_PORT", "7860"))
ROOT_PATH = get_or_create_env_var("ROOT_PATH", "")
DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var("DEFAULT_CONCURRENCY_LIMIT", "3")

# Allow list settings
GET_DEFAULT_ALLOW_LIST = get_or_create_env_var("GET_DEFAULT_ALLOW_LIST", "False")
# e.g. config/default_allow_list.csv
ALLOW_LIST_PATH = get_or_create_env_var("ALLOW_LIST_PATH", "")
# e.g. default_allow_list.csv
# This is a path within the DOCUMENT_REDACTION_BUCKET
S3_ALLOW_LIST_PATH = get_or_create_env_var("S3_ALLOW_LIST_PATH", "")

# Cost settings
SHOW_COSTS = get_or_create_env_var("SHOW_COSTS", "True")
GET_COST_CODES = get_or_create_env_var("GET_COST_CODES", "False")
# e.g. 'config/COST_CENTRES.csv'
# file should be a csv file with a single table in it that has two columns
# with a header. First column should contain cost codes, second column should
# contain a name or description for the cost code
COST_CODES_PATH = get_or_create_env_var("COST_CODES_PATH", "")
# e.g. COST_CENTRES.csv
# This is a path within the DOCUMENT_REDACTION_BUCKET
S3_COST_CODES_PATH = get_or_create_env_var("S3_COST_CODES_PATH", "")
# If you have cost codes listed, are they compulsory?
ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False')

# Enforcing cost codes requires them to be loaded, so enabling enforcement
# also switches cost-code loading on.
# A former follow-up rule reset ENFORCE_COST_CODES to 'False' whenever
# GET_COST_CODES was 'True'. Combined with the rule below, that made
# enforcement impossible to enable, so it has been removed.
if ENFORCE_COST_CODES == 'True':
    GET_COST_CODES = 'True'