openrefine-task-runner/example-duplicates/Taskfile.yml

58 lines
2.0 KiB
YAML
Raw Normal View History

2021-02-20 00:22:12 +01:00
version: '3'

tasks:
  # Entry point of this project: run the OpenRefine workflow, then validate
  # the log it produced.
  main:
    desc: Removing duplicates in a very small test dataset
    vars:
      # Portion of the invoked task name before the first ":" (for
      # "example-duplicates:main" this yields "example-duplicates").
      DIR: '{{splitList ":" .TASK | first}}'
    cmds:
      - task: refine
      # check OpenRefine log for any warnings and exit on error
      - task: :check
        vars:
          DIR: '{{.DIR}}'

  # Import the CSV, apply the deduplication rules and export the result.
  # The ":start", ":check" and ":stop" tasks are referenced with a leading
  # colon and are not defined in this file.
  refine:
    dir: './{{.DIR}}'
    vars:
      DIR: '{{splitList ":" .TASK | first}}'
      PROJECT: duplicates
      # assign a different port for each project
      PORT: 3335
      # maximum RAM for OpenRefine java heap space
      RAM: 2048M
      # Redirection target used below as "> {{.LOG}}": both stdout and stderr
      # are passed through tee, which appends them to "<PROJECT>.log".
      LOG: '>(tee -a "{{.PROJECT}}.log") 2>&1'
    # Inputs and outputs declared for this task (Task's up-to-date check).
    sources:
      - Taskfile.yml
      - input/**
      - config/**
    generates:
      - './{{.PROJECT}}.openrefine.tar.gz'
      - output/**
    # workaround to avoid an orphaned Java process on error
    # https://github.com/go-task/task/issues/141
    ignore_error: true
    cmds:
      # launch OpenRefine
      - task: :start
        vars:
          DIR: '{{.DIR}}'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
          RAM: '{{.RAM}}'
      # import file
      # NOTE(review): $CLIENT is not set in this file; presumably it is
      # exported by the parent Taskfile - confirm.
      - >
        "$CLIENT" -P {{.PORT}}
        --create "$(readlink -m input/duplicates.csv)"
        --encoding UTF-8
        --projectName "{{.PROJECT}}"
        > {{.LOG}}
      # apply transformation rules
      - >
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/duplicates-deletion.json
        > {{.LOG}}
      # export to file
      - >
        mkdir -p output &&
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --output "$(readlink -m output/deduped.xls)"
        > {{.LOG}}
      # print allocated system resources of the process listening on PORT
      # (ps reports rss in KiB, hence the division by 1024)
      - |
        PID="$(lsof -t -i:{{.PORT}})"
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
      # shut down OpenRefine and archive the OpenRefine project
      - task: :stop
        vars:
          DIR: '{{.DIR}}'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'

  # enable standalone execution (running `task` in project directory):
  # builds "<current directory name>:main", changes to the parent directory
  # and invokes task with that name.
  default:
    cmds:
      - 'DIR="${PWD##*/}:main" && cd .. && task "$DIR"'