# openrefine-task-runner/example-doaj/Taskfile.yml
#
# 78 lines
# 2.6 KiB
# YAML

version: '3'

tasks:
  # Project entry point: run the OpenRefine workflow, then hand the project
  # name to the `check` task of the including Taskfile.
  main:
    desc: Library Carpentry Lesson covering DOAJ
    cmds:
      - task: refine
      - task: :check # check OpenRefine log for any warnings and exit on error
        # PROJECT is the first colon-separated segment of this task's name
        # (presumably the namespace under which this Taskfile is included).
        vars: {PROJECT: '{{splitList ":" .TASK | first}}'}

  # Full workflow: start OpenRefine, import the CSV, apply the transformation
  # rules, export the result, print resource usage and stop the server.
  refine:
    vars:
      PORT: 3335 # assign a different port for each project
      RAM: 2048M # maximum RAM for OpenRefine java heap space
      PROJECT: '{{splitList ":" .TASK | first}}'
    deps: # will be executed each run independent of up-to-date check
      - task: download
    cmds: # tasks prepended with ":" are defined in Taskfile.yml
      - task: :start
        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
      - task: import
        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
      - task: apply
        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
      - task: export
        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
      - task: stats
        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
      - task: :stop
        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
    # Inputs and outputs used for the up-to-date check of this task.
    sources:
      - input/**
      - config/**
    generates:
      - output/openrefine.log
      - 'output/{{.PROJECT}}.openrefine.tar.gz'
    # workaround to avoid an orphaned Java process on error
    # https://github.com/go-task/task/issues/141
    ignore_error: true

  # Fetch the sample data and the transformation rules into input/ and config/.
  download:
    cmds:
      - mkdir -p input config
      - >-
        wget --no-verbose -O input/doaj-article-sample.csv
        https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-article-sample.csv
      - >-
        wget --no-verbose -O config/doaj-openrefine.json
        https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-openrefine.json

  # Create an OpenRefine project named PROJECT from the downloaded CSV.
  # Expects PORT and PROJECT to be passed in by the calling task.
  import:
    dir: input
    cmds:
      - | # import file
        ../../openrefine/client -P {{.PORT}} \
        --create doaj-article-sample.csv \
        --projectName "{{.PROJECT}}"
    ignore_error: true # workaround

  # Apply the downloaded JSON rules to the project.
  # Expects PORT and PROJECT to be passed in by the calling task.
  apply:
    dir: config
    cmds:
      - | # apply transformation rules
        ../../openrefine/client -P {{.PORT}} "{{.PROJECT}}" \
        --apply doaj-openrefine.json
    ignore_error: true # workaround

  # Export the project to output/doaj-results.tsv.
  # Expects PORT and PROJECT to be passed in by the calling task.
  export:
    dir: output
    cmds:
      - | # export to file; use readlink to log full path to output file
        ../../openrefine/client -P {{.PORT}} "{{.PROJECT}}" \
        --output "$(readlink -m doaj-results.tsv)"
    ignore_error: true # workaround

  # Report start time, elapsed time, memory and CPU usage of the process(es)
  # that lsof finds on PORT.
  stats:
    cmds:
      # print allocated system resources
      - ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}})
    ignore_error: true # workaround

  # enable standalone execution (running `task` in project directory):
  # build "<directory name>:main", change to the parent directory and run
  # that task from there.
  default:
    cmds:
      - 'PROJECT="${PWD##*/}:main" && cd .. && task "$PROJECT"'