Commit
·
a7566b9
1
Parent(s):
95ca426
Adapted Dockerfile for systems with read-only file systems. Minor package updates.
Browse files
- .dockerignore +14 -6
- .gitignore +11 -2
- Dockerfile +75 -40
- README.md +2 -2
- pyproject.toml +4 -4
- requirements.txt +2 -2
.dockerignore
CHANGED
@@ -4,10 +4,9 @@
|
|
4 |
*.jpg
|
5 |
*.png
|
6 |
*.ipynb
|
|
|
7 |
examples/*
|
8 |
processing/*
|
9 |
-
input/*
|
10 |
-
output/*
|
11 |
tools/__pycache__/*
|
12 |
old_code/*
|
13 |
tesseract/*
|
@@ -15,9 +14,18 @@ poppler/*
|
|
15 |
build/*
|
16 |
dist/*
|
17 |
build_deps/*
|
18 |
-
logs/*
|
19 |
-
config/*
|
20 |
user_guide/*
|
21 |
-
cdk/*
|
22 |
cdk/config/*
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
*.jpg
|
5 |
*.png
|
6 |
*.ipynb
|
7 |
+
*.pyc
|
8 |
examples/*
|
9 |
processing/*
|
|
|
|
|
10 |
tools/__pycache__/*
|
11 |
old_code/*
|
12 |
tesseract/*
|
|
|
14 |
build/*
|
15 |
dist/*
|
16 |
build_deps/*
|
|
|
|
|
17 |
user_guide/*
|
|
|
18 |
cdk/config/*
|
19 |
+
tld/*
|
20 |
+
cdk/config/*
|
21 |
+
cdk/cdk.out/*
|
22 |
+
cdk/archive/*
|
23 |
+
cdk.json
|
24 |
+
cdk.context.json
|
25 |
+
.quarto/*
|
26 |
+
logs/
|
27 |
+
output/
|
28 |
+
input/
|
29 |
+
feedback/
|
30 |
+
config/
|
31 |
+
usage/
|
.gitignore
CHANGED
@@ -4,6 +4,7 @@
|
|
4 |
*.jpg
|
5 |
*.png
|
6 |
*.ipynb
|
|
|
7 |
examples/*
|
8 |
processing/*
|
9 |
input/*
|
@@ -19,6 +20,14 @@ logs/*
|
|
19 |
config/*
|
20 |
doc_redaction_amplify_app/*
|
21 |
user_guide/*
|
22 |
-
cdk/*
|
23 |
cdk/config/*
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
*.jpg
|
5 |
*.png
|
6 |
*.ipynb
|
7 |
+
*.pyc
|
8 |
examples/*
|
9 |
processing/*
|
10 |
input/*
|
|
|
20 |
config/*
|
21 |
doc_redaction_amplify_app/*
|
22 |
user_guide/*
|
|
|
23 |
cdk/config/*
|
24 |
+
cdk/cdk.out/*
|
25 |
+
cdk/archive/*
|
26 |
+
tld/*
|
27 |
+
tmp/*
|
28 |
+
cdk.out/*
|
29 |
+
cdk.json
|
30 |
+
cdk.context.json
|
31 |
+
.quarto/*
|
32 |
+
/.quarto/
|
33 |
+
/_site/
|
Dockerfile
CHANGED
@@ -1,14 +1,14 @@
|
|
1 |
# Stage 1: Build dependencies and download models
|
2 |
FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm AS builder
|
3 |
|
4 |
-
# Install system dependencies
|
5 |
RUN apt-get update \
|
6 |
&& apt-get install -y \
|
7 |
g++ \
|
8 |
make \
|
9 |
cmake \
|
10 |
unzip \
|
11 |
-
libcurl4-openssl-dev \
|
12 |
git \
|
13 |
&& apt-get clean \
|
14 |
&& rm -rf /var/lib/apt/lists/*
|
@@ -17,28 +17,20 @@ WORKDIR /src
|
|
17 |
|
18 |
COPY requirements.txt .
|
19 |
|
20 |
-
RUN pip install --no-cache-dir --target=/install -r requirements.txt
|
21 |
-
|
22 |
-
RUN rm requirements.txt
|
23 |
|
24 |
-
# Add
|
25 |
COPY lambda_entrypoint.py .
|
26 |
-
|
27 |
COPY entrypoint.sh .
|
28 |
|
29 |
# Stage 2: Final runtime image
|
30 |
FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm
|
31 |
|
32 |
-
#
|
33 |
ARG APP_MODE=gradio
|
34 |
-
|
35 |
-
# Echo the APP_MODE during the build to confirm its value
|
36 |
-
RUN echo "APP_MODE is set to: ${APP_MODE}"
|
37 |
-
|
38 |
-
# Set APP_MODE as an environment variable for runtime
|
39 |
ENV APP_MODE=${APP_MODE}
|
40 |
|
41 |
-
# Install
|
42 |
RUN apt-get update \
|
43 |
&& apt-get install -y \
|
44 |
tesseract-ocr \
|
@@ -48,30 +40,85 @@ RUN apt-get update \
|
|
48 |
&& apt-get clean \
|
49 |
&& rm -rf /var/lib/apt/lists/*
|
50 |
|
51 |
-
#
|
52 |
RUN useradd -m -u 1000 user
|
|
|
53 |
|
54 |
-
#
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
# Copy installed packages from builder stage
|
59 |
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
60 |
|
61 |
-
#
|
62 |
-
|
63 |
|
64 |
-
#
|
65 |
COPY entrypoint.sh /entrypoint.sh
|
66 |
-
|
67 |
RUN chmod +x /entrypoint.sh
|
68 |
|
69 |
-
# Switch to
|
70 |
USER user
|
71 |
|
72 |
-
|
|
|
73 |
|
74 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
ENV PATH=$APP_HOME/.local/bin:$PATH \
|
76 |
PYTHONPATH=$APP_HOME/app \
|
77 |
PYTHONUNBUFFERED=1 \
|
@@ -80,20 +127,8 @@ ENV PATH=$APP_HOME/.local/bin:$PATH \
|
|
80 |
GRADIO_NUM_PORTS=1 \
|
81 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
82 |
GRADIO_SERVER_PORT=7860 \
|
83 |
-
GRADIO_ANALYTICS_ENABLED=False
|
84 |
-
TLDEXTRACT_CACHE=$APP_HOME/app/tld/.tld_set_snapshot \
|
85 |
-
SYSTEM=spaces
|
86 |
-
|
87 |
-
# Set the working directory to the user's home directory
|
88 |
-
WORKDIR $APP_HOME/app
|
89 |
-
|
90 |
-
# Copy the app code to the container
|
91 |
-
COPY --chown=user . $APP_HOME/app
|
92 |
-
|
93 |
-
# Ensure permissions are really user:user again after copying
|
94 |
-
RUN chown -R user:user $APP_HOME/app && chmod -R u+rwX $APP_HOME/app
|
95 |
|
96 |
-
ENTRYPOINT [
|
97 |
|
98 |
-
|
99 |
-
CMD [ "lambda_entrypoint.lambda_handler" ]
|
|
|
1 |
# Stage 1: Build dependencies and download models
|
2 |
FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm AS builder
|
3 |
|
4 |
+
# Install system dependencies
|
5 |
RUN apt-get update \
|
6 |
&& apt-get install -y \
|
7 |
g++ \
|
8 |
make \
|
9 |
cmake \
|
10 |
unzip \
|
11 |
+
libcurl4-openssl-dev \
|
12 |
git \
|
13 |
&& apt-get clean \
|
14 |
&& rm -rf /var/lib/apt/lists/*
|
|
|
17 |
|
18 |
COPY requirements.txt .
|
19 |
|
20 |
+
RUN pip install --no-cache-dir --target=/install -r requirements.txt && rm requirements.txt
|
|
|
|
|
21 |
|
22 |
+
# Add lambda entrypoint and script
|
23 |
COPY lambda_entrypoint.py .
|
|
|
24 |
COPY entrypoint.sh .
|
25 |
|
26 |
# Stage 2: Final runtime image
|
27 |
FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm
|
28 |
|
29 |
+
# Set build-time and runtime environment variable
|
30 |
ARG APP_MODE=gradio
|
|
|
|
|
|
|
|
|
|
|
31 |
ENV APP_MODE=${APP_MODE}
|
32 |
|
33 |
+
# Install runtime dependencies
|
34 |
RUN apt-get update \
|
35 |
&& apt-get install -y \
|
36 |
tesseract-ocr \
|
|
|
40 |
&& apt-get clean \
|
41 |
&& rm -rf /var/lib/apt/lists/*
|
42 |
|
43 |
+
# Create non-root user
|
44 |
RUN useradd -m -u 1000 user
|
45 |
+
ENV APP_HOME=/home/user
|
46 |
|
47 |
+
# Set env variables for Gradio & other apps
|
48 |
+
ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
|
49 |
+
TLDEXTRACT_CACHE=/tmp/tld/ \
|
50 |
+
MPLCONFIGDIR=/tmp/matplotlib_cache/ \
|
51 |
+
GRADIO_OUTPUT_FOLDER=$APP_HOME/app/output/ \
|
52 |
+
GRADIO_INPUT_FOLDER=$APP_HOME/app/input/ \
|
53 |
+
FEEDBACK_LOGS_FOLDER=$APP_HOME/app/feedback/ \
|
54 |
+
ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
|
55 |
+
USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
|
56 |
+
CONFIG_FOLDER=$APP_HOME/app/config/ \
|
57 |
+
XDG_CACHE_HOME=/tmp/xdg_cache/user_1000
|
58 |
+
|
59 |
+
# Create the base application directory and set its ownership
|
60 |
+
RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app
|
61 |
+
|
62 |
+
# Create required sub-folders within the app directory and set their permissions
|
63 |
+
# This ensures these specific directories are owned by 'user'
|
64 |
+
RUN mkdir -p \
|
65 |
+
${APP_HOME}/app/output \
|
66 |
+
${APP_HOME}/app/input \
|
67 |
+
${APP_HOME}/app/logs \
|
68 |
+
${APP_HOME}/app/usage \
|
69 |
+
${APP_HOME}/app/feedback \
|
70 |
+
${APP_HOME}/app/config \
|
71 |
+
&& chown user:user \
|
72 |
+
${APP_HOME}/app/output \
|
73 |
+
${APP_HOME}/app/input \
|
74 |
+
${APP_HOME}/app/logs \
|
75 |
+
${APP_HOME}/app/usage \
|
76 |
+
${APP_HOME}/app/feedback \
|
77 |
+
${APP_HOME}/app/config \
|
78 |
+
&& chmod 755 \
|
79 |
+
${APP_HOME}/app/output \
|
80 |
+
${APP_HOME}/app/input \
|
81 |
+
${APP_HOME}/app/logs \
|
82 |
+
${APP_HOME}/app/usage \
|
83 |
+
${APP_HOME}/app/feedback \
|
84 |
+
${APP_HOME}/app/config
|
85 |
+
|
86 |
+
# Now handle the /tmp and /var/tmp directories and their subdirectories
|
87 |
+
RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
|
88 |
+
&& chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
|
89 |
+
&& chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
|
90 |
+
&& chmod 700 ${XDG_CACHE_HOME}
|
91 |
|
92 |
# Copy installed packages from builder stage
|
93 |
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
94 |
|
95 |
+
# Copy app code and entrypoint with correct ownership
|
96 |
+
COPY --chown=user . $APP_HOME/app
|
97 |
|
98 |
+
# Copy and chmod entrypoint
|
99 |
COPY entrypoint.sh /entrypoint.sh
|
|
|
100 |
RUN chmod +x /entrypoint.sh
|
101 |
|
102 |
+
# Switch to user
|
103 |
USER user
|
104 |
|
105 |
+
# Declare working directory
|
106 |
+
WORKDIR $APP_HOME/app
|
107 |
|
108 |
+
# Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
|
109 |
+
VOLUME ["/tmp/matplotlib_cache"]
|
110 |
+
VOLUME ["/tmp/gradio_tmp"]
|
111 |
+
VOLUME ["/tmp/tld"]
|
112 |
+
VOLUME ["/home/user/app/output"]
|
113 |
+
VOLUME ["/home/user/app/input"]
|
114 |
+
VOLUME ["/home/user/app/logs"]
|
115 |
+
VOLUME ["/home/user/app/usage"]
|
116 |
+
VOLUME ["/home/user/app/feedback"]
|
117 |
+
VOLUME ["/home/user/app/config"]
|
118 |
+
VOLUME ["/tmp"]
|
119 |
+
VOLUME ["/var/tmp"]
|
120 |
+
|
121 |
+
# Set runtime environment
|
122 |
ENV PATH=$APP_HOME/.local/bin:$PATH \
|
123 |
PYTHONPATH=$APP_HOME/app \
|
124 |
PYTHONUNBUFFERED=1 \
|
|
|
127 |
GRADIO_NUM_PORTS=1 \
|
128 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
129 |
GRADIO_SERVER_PORT=7860 \
|
130 |
+
GRADIO_ANALYTICS_ENABLED=False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
|
132 |
+
ENTRYPOINT ["/entrypoint.sh"]
|
133 |
|
134 |
+
CMD ["lambda_entrypoint.lambda_handler"]
|
|
README.md
CHANGED
@@ -5,12 +5,12 @@ colorFrom: blue
|
|
5 |
colorTo: yellow
|
6 |
sdk: docker
|
7 |
app_file: app.py
|
8 |
-
pinned:
|
9 |
license: agpl-3.0
|
10 |
---
|
11 |
# Document redaction
|
12 |
|
13 |
-
version: 0.
|
14 |
|
15 |
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
|
16 |
|
|
|
5 |
colorTo: yellow
|
6 |
sdk: docker
|
7 |
app_file: app.py
|
8 |
+
pinned: true
|
9 |
license: agpl-3.0
|
10 |
---
|
11 |
# Document redaction
|
12 |
|
13 |
+
version: 0.7.0
|
14 |
|
15 |
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
|
16 |
|
pyproject.toml
CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4 |
|
5 |
[project]
|
6 |
name = "doc_redaction"
|
7 |
-
version = "0.
|
8 |
description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
|
9 |
readme = "README.md"
|
10 |
requires-python = ">=3.10"
|
@@ -23,8 +23,8 @@ dependencies = [
|
|
23 |
"spacy==3.8.4",
|
24 |
# Direct URL dependency for spacy model
|
25 |
"en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
|
26 |
-
"gradio==5.
|
27 |
-
"boto3==1.38.
|
28 |
"pyarrow==19.0.1",
|
29 |
"openpyxl==3.1.5",
|
30 |
"Faker==36.1.1",
|
@@ -39,7 +39,7 @@ dependencies = [
|
|
39 |
]
|
40 |
|
41 |
[project.urls]
|
42 |
-
Homepage = "https://seanpedrick-case.github.io/doc_redaction/
|
43 |
repository = "https://github.com/seanpedrick-case/doc_redaction"
|
44 |
|
45 |
[project.optional-dependencies]
|
|
|
4 |
|
5 |
[project]
|
6 |
name = "doc_redaction"
|
7 |
+
version = "0.7.0"
|
8 |
description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
|
9 |
readme = "README.md"
|
10 |
requires-python = ">=3.10"
|
|
|
23 |
"spacy==3.8.4",
|
24 |
# Direct URL dependency for spacy model
|
25 |
"en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
|
26 |
+
"gradio==5.33.2",
|
27 |
+
"boto3==1.38.35",
|
28 |
"pyarrow==19.0.1",
|
29 |
"openpyxl==3.1.5",
|
30 |
"Faker==36.1.1",
|
|
|
39 |
]
|
40 |
|
41 |
[project.urls]
|
42 |
+
Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
|
43 |
repository = "https://github.com/seanpedrick-case/doc_redaction"
|
44 |
|
45 |
[project.optional-dependencies]
|
requirements.txt
CHANGED
@@ -10,8 +10,8 @@ pandas==2.2.3
|
|
10 |
scikit-learn==1.6.1
|
11 |
spacy==3.8.4
|
12 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
13 |
-
gradio==5.
|
14 |
-
boto3==1.38.
|
15 |
pyarrow==19.0.1
|
16 |
openpyxl==3.1.5
|
17 |
Faker==36.1.1
|
|
|
10 |
scikit-learn==1.6.1
|
11 |
spacy==3.8.4
|
12 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
13 |
+
gradio==5.33.2
|
14 |
+
boto3==1.38.35
|
15 |
pyarrow==19.0.1
|
16 |
openpyxl==3.1.5
|
17 |
Faker==36.1.1
|