openrefine-task-runner/Taskfile.yml

110 lines
4.3 KiB
YAML
Raw Normal View History

2021-02-20 00:22:12 +01:00
# Taskfile schema version.
version: '3'

# Do not echo each command before running it.
silent: true

# Environment variables available to every command in this file.
env:
  # Handed to OpenRefine through the -m option in the start task.
  OPENREFINE_MEMORY: 5120M
  # Port OpenRefine is started on; also used by stats, stop and kill.
  OPENREFINE_PORT: 3333
  # Absolute path of the .openrefine directory (readlink -m resolves it
  # even if the directory does not exist yet).
  OPENREFINE_APPDIR:
    sh: readlink -m .openrefine
  # Newly created temporary directory; used as OpenRefine data directory
  # and as location of log.txt, removed again by the cleanup task.
  OPENREFINE_TMPDIR:
    sh: mktemp -d
2021-02-20 00:22:12 +01:00
tasks:
default:
  desc: run tasks start, import, transform, export, stats, check, kill and cleanup
  # Deferred entries are executed when the task ends, last registered first,
  # so the teardown order is: stats, check, kill, cleanup.
  cmds:
    # registered first => runs last, even when one of the commands below fails
    - defer:
        task: cleanup
    - task: start
    # runs before cleanup
    - defer:
        task: kill
    # runs before kill
    - defer:
        task: check
    # runs before check
    - defer:
        task: stats
    - task: import
    - task: transform
    - task: export
  # Inputs that are fingerprinted to decide whether the task has to run.
  sources:
    - Taskfile.yml
    - input/**
    - config/**
  # Files produced by the task.
  generates:
    - output/**
2021-02-20 00:22:12 +01:00
install:
  desc: install OpenRefine and openrefine-client into subdirectory ${OPENREFINE_APPDIR}
  cmds:
    - mkdir -p "${OPENREFINE_APPDIR}"
    # Install OpenRefine into subdirectory ${OPENREFINE_APPDIR}.
    # The steps are chained with && so that extraction only happens after a
    # successful download and the archive is only removed after a successful
    # extraction.
    - |
      wget --no-verbose -O openrefine.tar.gz \
        https://github.com/OpenRefine/OpenRefine/releases/download/3.5.2/openrefine-linux-3.5.2.tar.gz \
        && tar -xzf openrefine.tar.gz -C "${OPENREFINE_APPDIR}" --strip 1 \
        && rm openrefine.tar.gz
    # Optimize OpenRefine for batch processing.
    - |
      # fix path issue in OpenRefine startup file
      sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "${OPENREFINE_APPDIR}/refine"
      # do not try to open OpenRefine in browser
      sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "${OPENREFINE_APPDIR}/refine.ini"
      # set autosave period from 5 minutes to 24 hours (1440 minutes)
      sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "${OPENREFINE_APPDIR}/refine.ini"
    # Install openrefine-client into subdirectory ${OPENREFINE_APPDIR}.
    # wget -O creates the target file even when the download fails, so the
    # executable bit is only set after wget reported success.
    - |
      wget --no-verbose -O "${OPENREFINE_APPDIR}/client" \
        https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux \
        && chmod +x "${OPENREFINE_APPDIR}/client"
2021-02-20 00:22:12 +01:00
start:
  # Requirement: the OpenRefine start script must exist (see task install).
  - |
    test -f "${OPENREFINE_APPDIR}/refine" || {
      echo 1>&2 "OpenRefine missing; try task install"
      exit 1
    }
  # Launch OpenRefine in the background with the temporary directory as data
  # directory; stdout and stderr are appended to log.txt in that directory.
  - |
    "${OPENREFINE_APPDIR}/refine" \
      -v warn \
      -p "$OPENREFINE_PORT" \
      -m "$OPENREFINE_MEMORY" \
      -d "${OPENREFINE_TMPDIR}" \
      >> "${OPENREFINE_TMPDIR}/log.txt" 2>&1 &
  # Poll once per second until the start page contains "OpenRefine";
  # give up after 30 seconds.
  - |
    timeout 30s bash -c "until wget -q -O - -o /dev/null http://localhost:${OPENREFINE_PORT} | cat | grep -q -o OpenRefine; do sleep 1; done"
import:
  # Create project "myproject" from input/duplicates.csv. The client requires
  # an absolute path, hence readlink -m. Client output is shown on the
  # terminal and appended to log.txt.
  - |
    input_file="$(readlink -m input/duplicates.csv)"
    "${OPENREFINE_APPDIR}/client" --create "${input_file}" --projectName myproject \
      > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
transform:
  # Apply every undo/redo history file in config/ to project "myproject".
  # Client output is shown on the terminal and appended to log.txt.
  - |
    for f in config/*.json; do
      # Without any matching file the shell leaves the pattern unexpanded;
      # skip that literal "config/*.json" instead of passing it to the client.
      [ -e "$f" ] || continue
      "${OPENREFINE_APPDIR}/client" myproject --apply "$f" \
        > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
    done
export:
  - mkdir -p output
  # Export project "myproject" to output/deduped.tsv (passed to the client as
  # an absolute path). Client output is shown on the terminal and appended to
  # log.txt.
  - |
    output_file="$(readlink -m output/deduped.tsv)"
    "${OPENREFINE_APPDIR}/client" myproject --output "${output_file}" \
      > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
stats:
  # Print RAM and CPU usage of the OpenRefine process; the figures are shown
  # on the terminal and appended to log.txt.
  - |
    # Only ask for the process LISTENING on the port. A plain -i:PORT also
    # matches clients that merely hold a connection to it, which would yield
    # several PIDs and break the ps calls below. "|| true" keeps an empty
    # lsof result from aborting before the explicit check.
    PID="$(lsof -t -iTCP:"${OPENREFINE_PORT}" -sTCP:LISTEN || true)"
    if [ -z "$PID" ]; then
      echo 1>&2 "no process is listening on port ${OPENREFINE_PORT}; cannot collect stats"
      exit 1
    fi
    # ps reports rss in KiB; divide by 1024 for MB
    echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" \
      > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
    echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \
      > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
check:
  # Fail if log.txt mentions "exception" or "error" (case-insensitive).
  # grep prints the matching lines; afterwards the whole log is dumped.
  - |
    logfile="${OPENREFINE_TMPDIR}/log.txt"
    if grep -i 'exception\|error' "${logfile}"; then
      echo 1>&2 "log contains warnings!"
      echo
      cat "${logfile}"
      exit 1
    fi
2021-02-20 00:22:12 +01:00
stop:
  # Shut down OpenRefine gracefully: send SIGTERM, then wait until the
  # process has disappeared.
  - |
    # Only target the process LISTENING on the port. A plain -i:PORT also
    # matches clients connected to it (e.g. a browser showing the OpenRefine
    # UI), which would then receive the signal as well. "|| true" keeps an
    # empty lsof result from aborting before the explicit check.
    PID="$(lsof -t -iTCP:"${OPENREFINE_PORT}" -sTCP:LISTEN || true)"
    if [ -z "$PID" ]; then
      echo 1>&2 "no process is listening on port ${OPENREFINE_PORT}; nothing to stop"
      exit 1
    fi
    # $PID is deliberately unquoted so that several PIDs are passed as
    # separate arguments.
    kill $PID
    while ps -p $PID > /dev/null; do sleep 1; done
2021-02-20 00:22:12 +01:00
kill:
  # Shut down OpenRefine immediately (SIGKILL) to save time.
  - |
    # Only target the process LISTENING on the port. A plain -i:PORT also
    # matches clients connected to it (e.g. a browser showing the OpenRefine
    # UI), which would be killed as well. "|| true" keeps an empty lsof
    # result from aborting before the explicit check.
    PID="$(lsof -t -iTCP:"${OPENREFINE_PORT}" -sTCP:LISTEN || true)"
    if [ -z "$PID" ]; then
      echo 1>&2 "no process is listening on port ${OPENREFINE_PORT}; nothing to kill"
      exit 1
    fi
    # $PID is deliberately unquoted so that several PIDs are passed as
    # separate arguments.
    kill -9 $PID
2022-04-06 13:30:59 +02:00
cleanup:
  # Remove the temporary directory together with the OpenRefine workspace
  # and log.txt stored in it.
  - rm -rf "${OPENREFINE_TMPDIR}"
git:
  desc: commit and push if something changed
  cmds:
    # stage all changes, including deletions and untracked files
    - git add --all
    # "|| exit 0" keeps the task going when git commit returns an error
    # (presumably the "nothing to commit" case)
    - git commit -m "latest change $(date -u)" || exit 0
    - git push