# openrefine-task-runner/Taskfile.yml
# https://github.com/opencultureconsulting/openrefine-tasks
version: '3'

# Project Taskfiles, keyed by the namespace used to call their tasks
# (e.g. "example-doaj:refine"); each value is the project's directory.
includes:
  example-doaj: example-doaj
  example-duplicates: example-duplicates
  example-powerhouse: example-powerhouse
  # add your project here

# Do not echo each command before it is executed.
silent: true
# Prefix every output line with the task it came from, so the output of
# projects running in parallel stays attributable.
output: prefixed
tasks:
  # Entry point: run the "refine" task of every included project as a
  # dependency, then scan the resulting logs via "check".
  default:
    desc: execute all projects in parallel
    deps:
      - task: example-doaj:refine
      - task: example-duplicates:refine
      - task: example-powerhouse:refine
      # add your project here
    cmds:
      - task: check

  # Download OpenRefine 3.4.1 and openrefine-client 0.3.10 into ./openrefine,
  # replacing any previous installation.
  install:
    desc: (re)install OpenRefine and openrefine-client into subdirectory openrefine
    cmds:
      - | # delete existing install and recreate folder
        rm -rf openrefine
        mkdir -p openrefine
      - > # download OpenRefine archive
        wget --no-verbose -O openrefine.tar.gz
        https://github.com/OpenRefine/OpenRefine/releases/download/3.4.1/openrefine-linux-3.4.1.tar.gz
      - > # install OpenRefine into subdirectory openrefine
        tar -xzf openrefine.tar.gz -C openrefine --strip-components=1
        && rm openrefine.tar.gz
      - | # optimize OpenRefine for batch processing
        sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "openrefine/refine" # fix path issue in OpenRefine startup file
        sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "openrefine/refine.ini" # do not try to open OpenRefine in browser
        sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "openrefine/refine.ini" # set autosave period to 1440 minutes (24 hours)
      - > # download openrefine-client into subdirectory openrefine
        wget --no-verbose -O openrefine/client
        https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
        && chmod +x openrefine/client

  # Launch OpenRefine in the background for one project.
  # Variables read: PROJECT (project directory), PORT, RAM.
  start:
    dir: ./{{.PROJECT}}
    cmds:
      - | # check install and delete any temporary OpenRefine files
        if [ ! -f "../openrefine/refine" ]; then
          echo 1>&2 "OpenRefine missing; try task install"; exit 1
        fi
        rm -rf ./*.project* workspace.json openrefine.log
      - > # launch OpenRefine with specific data directory and redirect its output to a log file
        ../openrefine/refine -v warn -p {{.PORT}} -m {{.RAM}}
        -d ../{{.PROJECT}}
        >> openrefine.log 2>&1 &
      - | # wait until OpenRefine API is available
        timeout 30s bash -c "until
        wget -q -O - http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine
        do sleep 1
        done"

  # Shut down the OpenRefine instance of one project and archive its data.
  # Variables read: PROJECT (project directory), PORT.
  stop:
    dir: ./{{.PROJECT}}
    cmds:
      - | # shut down OpenRefine
        # only match the process listening on the port, not clients connected to it
        PID=$(lsof -t -i:{{.PORT}} -sTCP:LISTEN)
        if [ -n "$PID" ]; then
          kill $PID
          while ps -p $PID > /dev/null; do sleep 1; done
        else
          # nothing to stop; still continue so the project gets archived
          echo 1>&2 "OpenRefine is not running on port {{.PORT}}"
        fi
      - > # archive the OpenRefine project and delete temporary files
        tar cfz
        {{.PROJECT}}.openrefine.tar.gz
        -C $(grep -l {{.PROJECT}} *.project/metadata.json | cut -d '/' -f 1)
        .
        && rm -rf ./*.project* workspace.json

  # Fail if any openrefine.log below the working directory mentions
  # "exception" or "error" (case-insensitive).
  check:
    desc: check OpenRefine log for any warnings and exit on error
    dir: ./{{.PROJECT}}
    cmds:
      - | # find log file(s) and check for "exception" or "error"
        logs=$(find . -name openrefine.log)
        # without any log file grep would fall back to reading stdin, so skip it
        if [ -n "$logs" ] && grep -i 'exception\|error' $logs; then
          echo 1>&2 "log contains warnings!"; exit 1
        fi