2021-02-20 00:22:12 +01:00
|
|
|
# Taskfile schema version (quoted so it stays a string).
version: '3'

# Do not echo each command before running it; only command output is shown.
silent: true
|
|
|
|
|
2021-02-23 22:45:03 +01:00
|
|
|
# Environment variables exported to every command in this Taskfile.
env:
  # Maximum memory handed to OpenRefine (passed to `refine -m`).
  OPENREFINE_MEMORY: 5120M
  # Port OpenRefine is started on (passed to `refine -p`) and polled/inspected
  # by the start and stop tasks.
  OPENREFINE_PORT: 3333
  # Absolute path of the local .openrefine directory holding the `refine`
  # launcher and the `client` binary.
  OPENREFINE_APPDIR:
    sh: readlink -m .openrefine
  # Absolute path of the scratch directory used as OpenRefine data directory
  # and log location; it is created here and removed again by the stop task.
  OPENREFINE_TMPDIR:
    sh: mkdir -p .openrefine/tmp; readlink -m .openrefine/tmp
|
2021-02-23 22:45:03 +01:00
|
|
|
|
2021-02-20 00:22:12 +01:00
|
|
|
tasks:
  # Entry point: start OpenRefine, run the example workflow and always stop
  # OpenRefine afterwards. `sources`/`generates` let Task skip the run when
  # nothing relevant changed; the precondition aborts early with a hint when
  # OpenRefine has not been installed yet.
  default:
    desc: run workflow in batch mode
    cmds:
      - defer: { task: stop } # will always be executed last
      - task: start
      - task: example
    sources:
      - Taskfile.yml
      - input/**
      - config/**
    generates:
      - output/**
    preconditions:
      - sh: test -f "${OPENREFINE_APPDIR}/refine"
        msg: "OpenRefine missing; try task install"

  # Launch OpenRefine in the background and block (max. 30 seconds) until its
  # start page is served on the configured port.
  start:
    - echo "start OpenRefine with max. $OPENREFINE_MEMORY on port $OPENREFINE_PORT..."
    - | # launch OpenRefine with specific data directory and redirect its output to a log file
      "${OPENREFINE_APPDIR}/refine" -v warn -p "$OPENREFINE_PORT" -m "$OPENREFINE_MEMORY" -d "${OPENREFINE_TMPDIR}" > "${OPENREFINE_TMPDIR}/log.txt" 2>&1 &
    - | # wait until OpenRefine API is available
      timeout 30s bash -c "until wget -q -O - -o /dev/null http://localhost:${OPENREFINE_PORT} | cat | grep -q -o OpenRefine; do sleep 1; done"

  # Example workflow: import a CSV file, apply every transformation history
  # found in config/ and export the result as TSV.
  example:
    - | # import (requires absolute path)
      "${OPENREFINE_APPDIR}/client" \
        --create "$(readlink -m input/duplicates.csv)" \
        --projectName example
    # A `for` loop only reports the status of its last iteration, so stop at
    # the first failing --apply instead of silently continuing.
    - | # apply undo/redo history
      for f in config/*.json; do
        "${OPENREFINE_APPDIR}/client" example --apply "$f" || exit 1
      done
    - | # export to TSV
      mkdir -p output
      "${OPENREFINE_APPDIR}/client" example \
        --output output/deduped.tsv

  # Stop OpenRefine, report its resource usage and fail when the log file
  # contains errors. The temporary directory is removed last (deferred).
  stop:
    - defer: rm -rf "${OPENREFINE_TMPDIR}"
    # If nothing is bound to the port (OpenRefine never came up or already
    # died) skip stats and kill, so the log check below still runs and can
    # show why.
    - | # print stats and kill OpenRefine immediately
      PID="$(lsof -t -i:${OPENREFINE_PORT})"
      if [ -n "$PID" ]; then
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM"
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time"
        kill -9 $PID
      else
        echo 1>&2 "OpenRefine is not running on port ${OPENREFINE_PORT}; nothing to stop"
      fi
    - | # check log file for any warnings
      if grep -i 'exception\|error' "${OPENREFINE_TMPDIR}/log.txt"
      then echo 1>&2 "log contains warnings!"; echo; cat "${OPENREFINE_TMPDIR}/log.txt"; exit 1
      fi

  # Download OpenRefine 3.5.2 and openrefine-client 0.3.10 into
  # ${OPENREFINE_APPDIR} and tune OpenRefine for unattended use.
  install:
    desc: install OpenRefine and openrefine-client into subdirectory ${OPENREFINE_APPDIR}
    cmds:
      - mkdir -p "${OPENREFINE_APPDIR}"
      # Steps are chained with && so a failed download is reported instead of
      # being masked by the commands that follow it.
      - | # install OpenRefine into subdirectory ${OPENREFINE_APPDIR}
        wget --no-verbose -O openrefine.tar.gz https://github.com/OpenRefine/OpenRefine/releases/download/3.5.2/openrefine-linux-3.5.2.tar.gz &&
        tar -xzf openrefine.tar.gz -C "${OPENREFINE_APPDIR}" --strip 1 && rm openrefine.tar.gz
      - | # optimize OpenRefine for batch processing
        sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "${OPENREFINE_APPDIR}/refine" # fix path issue in OpenRefine startup file
        sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "${OPENREFINE_APPDIR}/refine.ini" # do not try to open OpenRefine in browser
        sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "${OPENREFINE_APPDIR}/refine.ini" # set autosave period from 5 minutes to 24 hours (1440 minutes)
      # wget -O creates the target file even when the download fails, so only
      # mark it executable after a successful download.
      - | # install openrefine-client into subdirectory ${OPENREFINE_APPDIR}
        wget --no-verbose -O "${OPENREFINE_APPDIR}/client" https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux &&
        chmod +x "${OPENREFINE_APPDIR}/client"

  # Stage everything, commit with a UTC timestamp and push. A failing commit
  # (e.g. nothing to commit) is tolerated; the push still runs afterwards.
  git:
    desc: commit and push if something changed
    cmds:
      - git add -A
      - git commit -m "latest change $(date -u)" || exit 0
      - git push
|