# Taskfile (https://taskfile.dev) that runs an OpenRefine workflow in batch
# mode: start OpenRefine, import input/duplicates.csv, apply every undo/redo
# history file found in config/, export output/deduped.tsv, check the log
# and finally shut OpenRefine down again.
#
# Usage:
#   task install   # download OpenRefine and openrefine-client once
#   task           # run the workflow (default task)
version: '3'

# do not echo the commands themselves, only their output
silent: true

env:
  # maximum memory passed to OpenRefine via its -m option
  OPENREFINE_MEMORY: 5120M
  # port passed to OpenRefine via its -p option
  OPENREFINE_PORT: 3333
  # absolute path of the directory holding the refine and client executables
  OPENREFINE_APPDIR:
    sh: readlink -m .openrefine
  # absolute path of the OpenRefine data directory (created on evaluation);
  # it also receives log.txt and is removed by the cleanup task
  OPENREFINE_TMPDIR:
    sh: mkdir -p .openrefine/tmp; readlink -m .openrefine/tmp

tasks:
  # Entry point: start -> example -> check, with cleanup deferred to the end.
  default:
    desc: run workflow in batch mode
    cmds:
      - defer: { task: cleanup }  # will always be executed last
      - task: start
      - task: example
      - task: check
    sources:
      - Taskfile.yml
      - input/**
      - config/**
    generates:
      - output/**
    preconditions:
      - sh: test -f "${OPENREFINE_APPDIR}/refine"
        msg: "OpenRefine missing; try task install"

  # Launch OpenRefine in the background and block until it answers on HTTP.
  start:
    - echo "start OpenRefine with max. $OPENREFINE_MEMORY on port $OPENREFINE_PORT..."
    - |  # launch OpenRefine with specific data directory and redirect its output to a log file
      "${OPENREFINE_APPDIR}/refine" -v warn -p "$OPENREFINE_PORT" -m "$OPENREFINE_MEMORY" \
        -d "${OPENREFINE_TMPDIR}" > "${OPENREFINE_TMPDIR}/log.txt" 2>&1 &
    - |  # wait until OpenRefine API is available (gives up after 30 seconds)
      timeout 30s bash -c "until wget -q -O - -o /dev/null http://localhost:${OPENREFINE_PORT} | cat | grep -q -o OpenRefine; do sleep 1; done"

  # Example workflow: import the CSV, replay the JSON histories, export TSV.
  example:
    - |  # import (requires absolute path)
      "${OPENREFINE_APPDIR}/client" \
        --create "$(readlink -m input/duplicates.csv)" \
        --projectName example
    - |  # apply undo/redo history
      for f in config/*.json; do
        "${OPENREFINE_APPDIR}/client" example --apply "$f"
      done
    - |  # export to TSV
      mkdir -p output
      "${OPENREFINE_APPDIR}/client" example \
        --output output/deduped.tsv

  # Report resource usage of the OpenRefine process and fail if the log file
  # mentions an exception or error.
  check:
    - |  # print stats
      PID="$(lsof -t -i:${OPENREFINE_PORT})"
      echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM"
      echo "used $(ps --no-headers -o cputime -p "$PID") CPU time"
    - |  # check log file for any warnings
      if grep -i 'exception\|error' "${OPENREFINE_TMPDIR}/log.txt"
      then
        echo 1>&2 "log contains warnings!"; echo; cat "${OPENREFINE_TMPDIR}/log.txt"; exit 1
      fi

  # Stop OpenRefine and remove its data directory (deferred by default).
  cleanup:
    # Only call kill when lsof reported a process: a bare "kill -9" exits
    # non-zero, which would abort this task before the temporary files
    # are deleted (e.g. when OpenRefine never came up).
    - |  # kill OpenRefine immediately
      PID="$(lsof -t -i:${OPENREFINE_PORT})"
      if [ -n "$PID" ]; then
        kill -9 $PID
      fi
    - |  # delete temporary files
      rm -rf "${OPENREFINE_TMPDIR}"

  # One-time setup: download OpenRefine 3.5.2 and openrefine-client 0.3.10.
  install:
    desc: install OpenRefine and openrefine-client into subdirectory ${OPENREFINE_APPDIR}
    cmds:
      - mkdir -p "${OPENREFINE_APPDIR}"
      - |  # install OpenRefine into subdirectory ${OPENREFINE_APPDIR}
        wget --no-verbose -O openrefine.tar.gz \
          https://github.com/OpenRefine/OpenRefine/releases/download/3.5.2/openrefine-linux-3.5.2.tar.gz
        tar -xzf openrefine.tar.gz -C "${OPENREFINE_APPDIR}" --strip 1 && rm openrefine.tar.gz
      - |  # optimize OpenRefine for batch processing
        # fix path issue in OpenRefine startup file
        sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "${OPENREFINE_APPDIR}/refine"
        # do not try to open OpenRefine in browser
        sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "${OPENREFINE_APPDIR}/refine.ini"
        # raise autosave period to 1440 minutes (24 hours)
        sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "${OPENREFINE_APPDIR}/refine.ini"
      - |  # install openrefine-client into subdirectory ${OPENREFINE_APPDIR}
        wget --no-verbose -O "${OPENREFINE_APPDIR}/client" \
          https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
        chmod +x "${OPENREFINE_APPDIR}/client"

  # Convenience task: stage everything, commit with a UTC timestamp and push.
  git:
    desc: commit and push if something changed
    cmds:
      - git add -A
      # "|| exit 0" keeps the task green when there is nothing to commit
      - git commit -m "latest change $(date -u)" || exit 0
      - git push