Added tika jar into image to avoid downloading (#3167)
Browse files
### What problem does this PR solve?
Added the Tika server jar to the image so that it does not have to be downloaded. Closes #3017
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- .github/workflows/tests.yml +1 -1
- Dockerfile +5 -0
- Dockerfile.slim +5 -0
- download_deps.py +2 -0
.github/workflows/tests.yml
CHANGED
|
@@ -48,7 +48,7 @@ jobs:
|
|
| 48 |
- name: Build ragflow:dev-slim
|
| 49 |
run: |
|
| 50 |
RUNNER_WORKSPACE_PREFIX=${RUNNER_WORKSPACE_PREFIX:-$HOME}
|
| 51 |
-
cp -r ${RUNNER_WORKSPACE_PREFIX}/huggingface.co ${RUNNER_WORKSPACE_PREFIX}/nltk_data ${RUNNER_WORKSPACE_PREFIX}/libssl*.deb .
|
| 52 |
sudo docker pull ubuntu:24.04
|
| 53 |
sudo docker build -f Dockerfile.slim -t infiniflow/ragflow:dev-slim .
|
| 54 |
|
|
|
|
| 48 |
- name: Build ragflow:dev-slim
|
| 49 |
run: |
|
| 50 |
RUNNER_WORKSPACE_PREFIX=${RUNNER_WORKSPACE_PREFIX:-$HOME}
|
| 51 |
+
cp -r ${RUNNER_WORKSPACE_PREFIX}/huggingface.co ${RUNNER_WORKSPACE_PREFIX}/nltk_data ${RUNNER_WORKSPACE_PREFIX}/libssl*.deb ${RUNNER_WORKSPACE_PREFIX}/tika-server*.jar* .
|
| 52 |
sudo docker pull ubuntu:24.04
|
| 53 |
sudo docker build -f Dockerfile.slim -t infiniflow/ragflow:dev-slim .
|
| 54 |
|
Dockerfile
CHANGED
|
@@ -104,6 +104,11 @@ RUN --mount=type=bind,source=huggingface.co,target=/huggingface.co \
|
|
| 104 |
# Copy nltk data downloaded via download_deps.py
|
| 105 |
COPY nltk_data /root/nltk_data
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
# Copy compiled web pages
|
| 108 |
COPY --from=builder /ragflow/web/dist /ragflow/web/dist
|
| 109 |
|
|
|
|
| 104 |
# Copy nltk data downloaded via download_deps.py
|
| 105 |
COPY nltk_data /root/nltk_data
|
| 106 |
|
| 107 |
+
# https://github.com/chrismattmann/tika-python
|
| 108 |
+
# This is the only way to run python-tika without internet access. Without this set, the default is to check the tika version and pull latest every time from Apache.
|
| 109 |
+
COPY tika-server-standard-3.0.0.jar tika-server-standard-3.0.0.jar.md5 ./
|
| 110 |
+
# Must name the exact jar copied above; tika-python also reads "<jar>.md5" next to it (assumes WORKDIR is /ragflow).
ENV TIKA_SERVER_JAR="file:///ragflow/tika-server-standard-3.0.0.jar"
|
| 111 |
+
|
| 112 |
# Copy compiled web pages
|
| 113 |
COPY --from=builder /ragflow/web/dist /ragflow/web/dist
|
| 114 |
|
Dockerfile.slim
CHANGED
|
@@ -97,6 +97,11 @@ RUN --mount=type=bind,source=huggingface.co,target=/huggingface.co \
|
|
| 97 |
# Copy nltk data downloaded via download_deps.py
|
| 98 |
COPY nltk_data /root/nltk_data
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
# Copy compiled web pages
|
| 101 |
COPY --from=builder /ragflow/web/dist /ragflow/web/dist
|
| 102 |
|
|
|
|
| 97 |
# Copy nltk data downloaded via download_deps.py
|
| 98 |
COPY nltk_data /root/nltk_data
|
| 99 |
|
| 100 |
+
# https://github.com/chrismattmann/tika-python
|
| 101 |
+
# This is the only way to run python-tika without internet access. Without this set, the default is to check the tika version and pull latest every time from Apache.
|
| 102 |
+
COPY tika-server-standard-3.0.0.jar tika-server-standard-3.0.0.jar.md5 ./
|
| 103 |
+
# Must name the exact jar copied above; tika-python also reads "<jar>.md5" next to it (assumes WORKDIR is /ragflow).
ENV TIKA_SERVER_JAR="file:///ragflow/tika-server-standard-3.0.0.jar"
|
| 104 |
+
|
| 105 |
# Copy compiled web pages
|
| 106 |
COPY --from=builder /ragflow/web/dist /ragflow/web/dist
|
| 107 |
|
download_deps.py
CHANGED
|
@@ -7,6 +7,8 @@ import urllib.request
|
|
| 7 |
|
| 8 |
urls = [
|
| 9 |
"http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb",
|
|
|
|
|
|
|
| 10 |
]
|
| 11 |
|
| 12 |
repos = [
|
|
|
|
| 7 |
|
| 8 |
urls = [
|
| 9 |
"http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb",
|
| 10 |
+
"https://repo1.maven.org/maven2/org/apache/tika/tika-server-standard/3.0.0/tika-server-standard-3.0.0.jar",
|
| 11 |
+
"https://repo1.maven.org/maven2/org/apache/tika/tika-server-standard/3.0.0/tika-server-standard-3.0.0.jar.md5",
|
| 12 |
]
|
| 13 |
|
| 14 |
repos = [
|