# Source: openrefine-task-runner/example-duplicates/Taskfile.yml (71 lines, 2.2 KiB, YAML)

# Taskfile schema version (kept as a quoted string).
version: '3'
# Task definitions of this project; each key below is one task name.
tasks:
# Entry point of this project: runs the `refine` task and afterwards the
# `:check` task for the same project.
main:
  desc: Removing duplicates in a very small test dataset
  cmds:
    - task: refine
    # check OpenRefine log for any warnings and exit on error
    - task: ':check'
      vars:
        # project name = first ":"-separated segment of the current task name
        PROJECT: '{{splitList ":" .TASK | first}}'
# Complete workflow for this project. Runs, in this order:
# :start, import, apply, export, stats, :stop.
# PROJECT and PORT are passed to every step; RAM only to :start.
refine:
  vars:
    # assign a different port for each project
    PORT: 3334
    # maximum RAM for OpenRefine java heap space
    RAM: '2048M'
    # project name = first ":"-separated segment of the current task name
    PROJECT: '{{splitList ":" .TASK | first}}'
  # tasks prepended with ":" are defined in the root Taskfile.yml
  cmds:
    - task: ':start'
      vars:
        PROJECT: '{{.PROJECT}}'
        PORT: '{{.PORT}}'
        RAM: '{{.RAM}}'
    - task: import
      vars:
        PROJECT: '{{.PROJECT}}'
        PORT: '{{.PORT}}'
    - task: apply
      vars:
        PROJECT: '{{.PROJECT}}'
        PORT: '{{.PORT}}'
    - task: export
      vars:
        PROJECT: '{{.PROJECT}}'
        PORT: '{{.PORT}}'
    - task: stats
      vars:
        PROJECT: '{{.PROJECT}}'
        PORT: '{{.PORT}}'
    - task: ':stop'
      vars:
        PROJECT: '{{.PROJECT}}'
        PORT: '{{.PORT}}'
  # files declared as inputs of this task
  sources:
    - 'input/**'
    - 'config/**'
  # files declared as outputs of this task
  generates:
    - 'output/openrefine.log'
    - 'output/{{.PROJECT}}.openrefine.tar.gz'
  # workaround to avoid an orphaned Java process on error
  # https://github.com/go-task/task/issues/141
  ignore_error: true
# Calls the OpenRefine client on PORT with --create for duplicates.csv
# (UTF-8), naming the project PROJECT. Runs inside the `input` directory.
import:
  dir: input
  cmds:
    # import file
    - |
      ../../openrefine/client -P {{.PORT}} \
      --create duplicates.csv \
      --encoding UTF-8 \
      --projectName {{.PROJECT}}
  # workaround: a failing command is ignored instead of failing the task
  ignore_error: true
# Calls the OpenRefine client on PORT with --apply duplicates-deletion.json
# for project PROJECT. Runs inside the `config` directory.
apply:
  dir: config
  cmds:
    # apply transformation rules
    - |
      ../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
      --apply duplicates-deletion.json
  # workaround: a failing command is ignored instead of failing the task
  ignore_error: true
# Calls the OpenRefine client on PORT with --output deduped.xls for project
# PROJECT. Runs inside the `output` directory.
export:
  dir: output
  cmds:
    # export to file; use readlink to log full path to output file
    - |
      ../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
      --output "$(readlink -m deduped.xls)"
  # workaround: a failing command is ignored instead of failing the task
  ignore_error: true
# Runs `ps` (start time, elapsed time, %mem, %cpu, rss) for the PID(s)
# that `lsof -t` reports for PORT.
stats:
  cmds:
    # print allocated system resources
    - 'ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}})'
  # workaround: a failing command is ignored instead of failing the task
  ignore_error: true
# enable standalone execution (running `task` in project directory)
default:
  cmds:
    # Build "<current directory name>:main", change to the parent directory
    # and run that task from there.
    - 'PROJECT="${PWD##*/}:main" && cd .. && task "$PROJECT"'