grobid-papercheck / Dockerfile.evaluation
Jakub Werner
activated LFS and pushing
c6f9cb5
## Grobid evaluation image
# ------
# https://grobid.readthedocs.io/en/latest/End-to-end-evaluation/
# NOTE: To match the exact evaluation published in the Grobid documentation is necessary to have a
# running Biblio-glutton instance
#
# A project using this image can be found here: https://huggingface.co/spaces/lfoppiano/grobid-evaluation
# Please notice that the evaluation is run through a python script that runs all the needed commands
# TODO: upload the evaluation in Markdown somewhere
# -------------------
# build builder image
# -------------------
FROM eclipse-temurin:17.0.15_6-jdk AS builder
USER root
RUN apt-get update && \
apt-get -y upgrade && \
apt-get -y --no-install-recommends install unzip git
WORKDIR /opt/grobid
# gradle
COPY gradle/ ./gradle/
COPY gradlew ./
COPY gradle.properties ./
COPY build.gradle ./
COPY settings.gradle ./
COPY .git/ ./.git
# source
COPY grobid-home/ ./grobid-home/
COPY grobid-core/ ./grobid-core/
COPY grobid-service/ ./grobid-service/
COPY grobid-trainer/ ./grobid-trainer/
# cleaning unused native libraries before packaging
RUN rm -rf grobid-home/pdf2xml
RUN rm -rf grobid-home/pdfalto/lin-32
RUN rm -rf grobid-home/pdfalto/mac-64
RUN rm -rf grobid-home/pdfalto/mac_arm-64
RUN rm -rf grobid-home/pdfalto/win-*
RUN rm -rf grobid-home/lib/lin-32
RUN rm -rf grobid-home/lib/win-*
RUN rm -rf grobid-home/lib/mac-64
# Setting DL-powered configuration
RUN rm grobid-home/config/grobid.yaml && \
mv grobid-home/config/grobid-evaluation.yaml grobid-home/config/grobid.yaml
# Download evaluation data (For space reasons, we are not downloading the evaluation data) \
# See https://huggingface.co/spaces/lfoppiano/grobid-evaluation/blob/main/Dockerfile
WORKDIR /opt/grobid/evaluation
#RUN git lfs install && git clone --depth 1 https://huggingface.co/datasets/sciencialab/grobid-evaluation evaluation
#RUN chmod -R uog+rw /opt/grobid/evaluation
# -------------------
# build runtime image
# -------------------
# use NVIDIA Container Toolkit to automatically recognize possible GPU drivers on the host machine
FROM tensorflow/tensorflow:2.7.0-gpu
# setting locale is likely useless but to be sure
ENV LANG C.UTF-8
# update NVIDIA Cuda key (following a key rotation in April 2022)
RUN apt-get install -y wget && \
apt-key del 7fa2af80 && \
rm /etc/apt/sources.list.d/cuda.list && \
rm /etc/apt/sources.list.d/nvidia-ml.list && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb && \
dpkg -i cuda-keyring_1.1-1_all.deb
# install python and other dependencies
RUN apt-get update && \
apt-mark hold libcudnn8 && \
apt-get -y upgrade && \
apt-get -y --no-install-recommends install \
bash apt-utils build-essential gcc libxml2 libfontconfig unzip curl \
musl gfortran \
python3 python3-pip python3-setuptools python3-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /opt/grobid
COPY --from=builder /opt/grobid .
RUN python3 -m pip install pip --upgrade --no-cache-dir
# install DeLFT via pypi
RUN pip3 install --no-cache-dir requests delft==0.3.3
# link the data directory to /data
# the current working directory will most likely be /opt/grobid
RUN mkdir -p /data \
&& ln -s /data /opt/grobid/data \
&& ln -s /data ./data
# disable python warnings (and fix logging)
ENV PYTHONWARNINGS="ignore"
ENV JAVA_HOME=/opt/java/openjdk
ENV PATH=$JAVA_HOME/bin:$PATH
ENV JAVA_OPTS=-Xmx4g
# Get java jdk from builder image
# See tag of builder image in [Dockerfile tag](https://github.com/docker-library/docs/blob/master/eclipse-temurin/README.md#simple-tags) to find JAVA_HOME to copy from builder image.
COPY --from=builder /opt/java/openjdk ${JAVA_HOME}
# install jep
RUN pip3 install jep==4.0.3
ENV LD_LIBRARY_PATH=/usr/local/lib/python3.8/dist-packages/jep:grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep:${LD_LIBRARY_PATH}
# remove libjep.so because we are providing our own version in the virtual env above
RUN rm /opt/grobid/grobid-home/lib/lin-64/jep/libjep.so
# Add Tini
ENV TINI_VERSION v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "-s", "--"]
WORKDIR /opt/grobid
# preload embeddings, for GROBID all the RNN models use glove-840B (default for the script), ELMo is currently not loaded
# to be done: mechanism to download GROBID fine-tuned models based on SciBERT if selected (but not good enough for the moment)
COPY --from=builder /opt/grobid/grobid-home/scripts/preload_embeddings.py .
COPY --from=builder /opt/grobid/grobid-home/config/resources-registry.json .
RUN python3 preload_embeddings.py --registry ./resources-registry.json && \
ln -s /opt/grobid /opt/delft
RUN mkdir delft && \
cp ./resources-registry.json delft/
VOLUME ["/opt/grobid/grobid-home/tmp"]
LABEL \
authors="The contributors" \
org.label-schema.name="Grobid" \
org.label-schema.description="Image running the Grobid End 2 end evaluation" \
org.label-schema.url="https://github.com/kermitt2/Grobid" \
org.label-schema.version=${GROBID_VERSION}