seanpedrickcase committed on
Commit
a7566b9
·
1 Parent(s): 95ca426

Adapted Dockerfile for systems with a read-only file system. Minor package updates.

Browse files
Files changed (6) hide show
  1. .dockerignore +14 -6
  2. .gitignore +11 -2
  3. Dockerfile +75 -40
  4. README.md +2 -2
  5. pyproject.toml +4 -4
  6. requirements.txt +2 -2
.dockerignore CHANGED
@@ -4,10 +4,9 @@
4
  *.jpg
5
  *.png
6
  *.ipynb
 
7
  examples/*
8
  processing/*
9
- input/*
10
- output/*
11
  tools/__pycache__/*
12
  old_code/*
13
  tesseract/*
@@ -15,9 +14,18 @@ poppler/*
15
  build/*
16
  dist/*
17
  build_deps/*
18
- logs/*
19
- config/*
20
  user_guide/*
21
- cdk/*
22
  cdk/config/*
23
- web/*
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  *.jpg
5
  *.png
6
  *.ipynb
7
+ *.pyc
8
  examples/*
9
  processing/*
 
 
10
  tools/__pycache__/*
11
  old_code/*
12
  tesseract/*
 
14
  build/*
15
  dist/*
16
  build_deps/*
 
 
17
  user_guide/*
 
18
  cdk/config/*
19
+ tld/*
20
+ cdk/config/*
21
+ cdk/cdk.out/*
22
+ cdk/archive/*
23
+ cdk.json
24
+ cdk.context.json
25
+ .quarto/*
26
+ logs/
27
+ output/
28
+ input/
29
+ feedback/
30
+ config/
31
+ usage/
.gitignore CHANGED
@@ -4,6 +4,7 @@
4
  *.jpg
5
  *.png
6
  *.ipynb
 
7
  examples/*
8
  processing/*
9
  input/*
@@ -19,6 +20,14 @@ logs/*
19
  config/*
20
  doc_redaction_amplify_app/*
21
  user_guide/*
22
- cdk/*
23
  cdk/config/*
24
- web/*
 
 
 
 
 
 
 
 
 
 
4
  *.jpg
5
  *.png
6
  *.ipynb
7
+ *.pyc
8
  examples/*
9
  processing/*
10
  input/*
 
20
  config/*
21
  doc_redaction_amplify_app/*
22
  user_guide/*
 
23
  cdk/config/*
24
+ cdk/cdk.out/*
25
+ cdk/archive/*
26
+ tld/*
27
+ tmp/*
28
+ cdk.out/*
29
+ cdk.json
30
+ cdk.context.json
31
+ .quarto/*
32
+ /.quarto/
33
+ /_site/
Dockerfile CHANGED
@@ -1,14 +1,14 @@
1
  # Stage 1: Build dependencies and download models
2
  FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm AS builder
3
 
4
- # Install system dependencies. Need to specify -y for poppler to get it to install
5
  RUN apt-get update \
6
  && apt-get install -y \
7
  g++ \
8
  make \
9
  cmake \
10
  unzip \
11
- libcurl4-openssl-dev \
12
  git \
13
  && apt-get clean \
14
  && rm -rf /var/lib/apt/lists/*
@@ -17,28 +17,20 @@ WORKDIR /src
17
 
18
  COPY requirements.txt .
19
 
20
- RUN pip install --no-cache-dir --target=/install -r requirements.txt
21
-
22
- RUN rm requirements.txt
23
 
24
- # Add lambda_entrypoint.py to the container
25
  COPY lambda_entrypoint.py .
26
-
27
  COPY entrypoint.sh .
28
 
29
  # Stage 2: Final runtime image
30
  FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm
31
 
32
- # Define a build argument with a default value
33
  ARG APP_MODE=gradio
34
-
35
- # Echo the APP_MODE during the build to confirm its value
36
- RUN echo "APP_MODE is set to: ${APP_MODE}"
37
-
38
- # Set APP_MODE as an environment variable for runtime
39
  ENV APP_MODE=${APP_MODE}
40
 
41
- # Install system dependencies
42
  RUN apt-get update \
43
  && apt-get install -y \
44
  tesseract-ocr \
@@ -48,30 +40,85 @@ RUN apt-get update \
48
  && apt-get clean \
49
  && rm -rf /var/lib/apt/lists/*
50
 
51
- # Set up a new user named "user" with user ID 1000
52
  RUN useradd -m -u 1000 user
 
53
 
54
- # Create required directories
55
- RUN mkdir -p /home/user/app/{output,input,tld,logs,usage,feedback,config} \
56
- && chown -R user:user /home/user/app
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  # Copy installed packages from builder stage
59
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
60
 
61
- # Download NLTK data packages - now no longer necessary
62
- # RUN python -m nltk.downloader --quiet punkt stopwords punkt_tab
63
 
64
- # Entrypoint helps to switch between Gradio and Lambda mode
65
  COPY entrypoint.sh /entrypoint.sh
66
-
67
  RUN chmod +x /entrypoint.sh
68
 
69
- # Switch to the "user" user
70
  USER user
71
 
72
- ENV APP_HOME=/home/user
 
73
 
74
- # Set environmental variables
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  ENV PATH=$APP_HOME/.local/bin:$PATH \
76
  PYTHONPATH=$APP_HOME/app \
77
  PYTHONUNBUFFERED=1 \
@@ -80,20 +127,8 @@ ENV PATH=$APP_HOME/.local/bin:$PATH \
80
  GRADIO_NUM_PORTS=1 \
81
  GRADIO_SERVER_NAME=0.0.0.0 \
82
  GRADIO_SERVER_PORT=7860 \
83
- GRADIO_ANALYTICS_ENABLED=False \
84
- TLDEXTRACT_CACHE=$APP_HOME/app/tld/.tld_set_snapshot \
85
- SYSTEM=spaces
86
-
87
- # Set the working directory to the user's home directory
88
- WORKDIR $APP_HOME/app
89
-
90
- # Copy the app code to the container
91
- COPY --chown=user . $APP_HOME/app
92
-
93
- # Ensure permissions are really user:user again after copying
94
- RUN chown -R user:user $APP_HOME/app && chmod -R u+rwX $APP_HOME/app
95
 
96
- ENTRYPOINT [ "/entrypoint.sh" ]
97
 
98
- # Default command for Lambda mode
99
- CMD [ "lambda_entrypoint.lambda_handler" ]
 
1
  # Stage 1: Build dependencies and download models
2
  FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm AS builder
3
 
4
+ # Install system dependencies
5
  RUN apt-get update \
6
  && apt-get install -y \
7
  g++ \
8
  make \
9
  cmake \
10
  unzip \
11
+ libcurl4-openssl-dev \
12
  git \
13
  && apt-get clean \
14
  && rm -rf /var/lib/apt/lists/*
 
17
 
18
  COPY requirements.txt .
19
 
20
+ RUN pip install --no-cache-dir --target=/install -r requirements.txt && rm requirements.txt
 
 
21
 
22
+ # Add lambda entrypoint and script
23
  COPY lambda_entrypoint.py .
 
24
  COPY entrypoint.sh .
25
 
26
  # Stage 2: Final runtime image
27
  FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm
28
 
29
+ # Set build-time and runtime environment variable
30
  ARG APP_MODE=gradio
 
 
 
 
 
31
  ENV APP_MODE=${APP_MODE}
32
 
33
+ # Install runtime dependencies
34
  RUN apt-get update \
35
  && apt-get install -y \
36
  tesseract-ocr \
 
40
  && apt-get clean \
41
  && rm -rf /var/lib/apt/lists/*
42
 
43
+ # Create non-root user
44
  RUN useradd -m -u 1000 user
45
+ ENV APP_HOME=/home/user
46
 
47
+ # Set env variables for Gradio & other apps
48
+ ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
49
+ TLDEXTRACT_CACHE=/tmp/tld/ \
50
+ MPLCONFIGDIR=/tmp/matplotlib_cache/ \
51
+ GRADIO_OUTPUT_FOLDER=$APP_HOME/app/output/ \
52
+ GRADIO_INPUT_FOLDER=$APP_HOME/app/input/ \
53
+ FEEDBACK_LOGS_FOLDER=$APP_HOME/app/feedback/ \
54
+ ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
55
+ USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
56
+ CONFIG_FOLDER=$APP_HOME/app/config/ \
57
+ XDG_CACHE_HOME=/tmp/xdg_cache/user_1000
58
+
59
+ # Create the base application directory and set its ownership
60
+ RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app
61
+
62
+ # Create required sub-folders within the app directory and set their permissions
63
+ # This ensures these specific directories are owned by 'user'
64
+ RUN mkdir -p \
65
+ ${APP_HOME}/app/output \
66
+ ${APP_HOME}/app/input \
67
+ ${APP_HOME}/app/logs \
68
+ ${APP_HOME}/app/usage \
69
+ ${APP_HOME}/app/feedback \
70
+ ${APP_HOME}/app/config \
71
+ && chown user:user \
72
+ ${APP_HOME}/app/output \
73
+ ${APP_HOME}/app/input \
74
+ ${APP_HOME}/app/logs \
75
+ ${APP_HOME}/app/usage \
76
+ ${APP_HOME}/app/feedback \
77
+ ${APP_HOME}/app/config \
78
+ && chmod 755 \
79
+ ${APP_HOME}/app/output \
80
+ ${APP_HOME}/app/input \
81
+ ${APP_HOME}/app/logs \
82
+ ${APP_HOME}/app/usage \
83
+ ${APP_HOME}/app/feedback \
84
+ ${APP_HOME}/app/config
85
+
86
+ # Now handle the /tmp and /var/tmp directories and their subdirectories
87
+ RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
88
+ && chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
89
+ && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
90
+ && chmod 700 ${XDG_CACHE_HOME}
91
 
92
  # Copy installed packages from builder stage
93
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
94
 
95
+ # Copy app code and entrypoint with correct ownership
96
+ COPY --chown=user . $APP_HOME/app
97
 
98
+ # Copy and chmod entrypoint
99
  COPY entrypoint.sh /entrypoint.sh
 
100
  RUN chmod +x /entrypoint.sh
101
 
102
+ # Switch to user
103
  USER user
104
 
105
+ # Declare working directory
106
+ WORKDIR $APP_HOME/app
107
 
108
+ # Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
109
+ VOLUME ["/tmp/matplotlib_cache"]
110
+ VOLUME ["/tmp/gradio_tmp"]
111
+ VOLUME ["/tmp/tld"]
112
+ VOLUME ["/home/user/app/output"]
113
+ VOLUME ["/home/user/app/input"]
114
+ VOLUME ["/home/user/app/logs"]
115
+ VOLUME ["/home/user/app/usage"]
116
+ VOLUME ["/home/user/app/feedback"]
117
+ VOLUME ["/home/user/app/config"]
118
+ VOLUME ["/tmp"]
119
+ VOLUME ["/var/tmp"]
120
+
121
+ # Set runtime environment
122
  ENV PATH=$APP_HOME/.local/bin:$PATH \
123
  PYTHONPATH=$APP_HOME/app \
124
  PYTHONUNBUFFERED=1 \
 
127
  GRADIO_NUM_PORTS=1 \
128
  GRADIO_SERVER_NAME=0.0.0.0 \
129
  GRADIO_SERVER_PORT=7860 \
130
+ GRADIO_ANALYTICS_ENABLED=False
 
 
 
 
 
 
 
 
 
 
 
131
 
132
+ ENTRYPOINT ["/entrypoint.sh"]
133
 
134
+ CMD ["lambda_entrypoint.lambda_handler"]
 
README.md CHANGED
@@ -5,12 +5,12 @@ colorFrom: blue
5
  colorTo: yellow
6
  sdk: docker
7
  app_file: app.py
8
- pinned: false
9
  license: agpl-3.0
10
  ---
11
  # Document redaction
12
 
13
- version: 0.6.8
14
 
15
  Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
16
 
 
5
  colorTo: yellow
6
  sdk: docker
7
  app_file: app.py
8
+ pinned: true
9
  license: agpl-3.0
10
  ---
11
  # Document redaction
12
 
13
+ version: 0.7.0
14
 
15
  Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
16
 
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
 
5
  [project]
6
  name = "doc_redaction"
7
- version = "0.6.8"
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
@@ -23,8 +23,8 @@ dependencies = [
23
  "spacy==3.8.4",
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
- "gradio==5.29.1",
27
- "boto3==1.38.4",
28
  "pyarrow==19.0.1",
29
  "openpyxl==3.1.5",
30
  "Faker==36.1.1",
@@ -39,7 +39,7 @@ dependencies = [
39
  ]
40
 
41
  [project.urls]
42
- Homepage = "https://seanpedrick-case.github.io/doc_redaction/README.html"
43
  repository = "https://github.com/seanpedrick-case/doc_redaction"
44
 
45
  [project.optional-dependencies]
 
4
 
5
  [project]
6
  name = "doc_redaction"
7
+ version = "0.7.0"
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
 
23
  "spacy==3.8.4",
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
+ "gradio==5.33.2",
27
+ "boto3==1.38.35",
28
  "pyarrow==19.0.1",
29
  "openpyxl==3.1.5",
30
  "Faker==36.1.1",
 
39
  ]
40
 
41
  [project.urls]
42
+ Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
43
  repository = "https://github.com/seanpedrick-case/doc_redaction"
44
 
45
  [project.optional-dependencies]
requirements.txt CHANGED
@@ -10,8 +10,8 @@ pandas==2.2.3
10
  scikit-learn==1.6.1
11
  spacy==3.8.4
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
- gradio==5.29.1
14
- boto3==1.38.4
15
  pyarrow==19.0.1
16
  openpyxl==3.1.5
17
  Faker==36.1.1
 
10
  scikit-learn==1.6.1
11
  spacy==3.8.4
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
+ gradio==5.33.2
14
+ boto3==1.38.35
15
  pyarrow==19.0.1
16
  openpyxl==3.1.5
17
  Faker==36.1.1