# Taskfile for a single OpenRefine project: removes duplicates from a small
# test dataset. Task names prefixed with ":" (:start, :check, :stop) are not
# defined in this file; they are expected to be provided by the parent
# Taskfile that includes this one.
version: '3'

tasks:
  main:
    desc: Removing duplicates in a very small test dataset
    cmds:
      - task: refine
      # check OpenRefine log for any warnings and exit on error
      - task: :check
        vars:
          # project name = first ":"-separated segment of the task name
          PROJECT: '{{splitList ":" .TASK | first}}'

  refine:
    dir: ./{{.PROJECT}}
    vars:
      # assign a different port for each project
      PORT: 3334
      # maximum RAM for OpenRefine java heap space
      RAM: 2048M
      # project name = first ":"-separated segment of the task name
      PROJECT: '{{splitList ":" .TASK | first}}'
    cmds:
      # launch OpenRefine
      - task: :start
        vars:
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
          RAM: '{{.RAM}}'
      # import file
      # (folded scalar: the lines below are joined into one shell command;
      # stdout and stderr are shown and appended to openrefine.log via tee)
      - >
        ../openrefine/client -P {{.PORT}}
        --create "$(readlink -m input/duplicates.csv)"
        --encoding UTF-8
        --projectName {{.PROJECT}}
        > >(tee -a openrefine.log) 2>&1
      # apply transformation rules
      - >
        ../openrefine/client -P {{.PORT}} {{.PROJECT}}
        --apply config/duplicates-deletion.json
        > >(tee -a openrefine.log) 2>&1
      # export to file
      - >
        mkdir -p output &&
        ../openrefine/client -P {{.PORT}} {{.PROJECT}}
        --output "$(readlink -m output/deduped.xls)"
        > >(tee -a openrefine.log) 2>&1
      # print allocated system resources
      # (literal scalar: newlines are significant; PID is the process
      # listening on PORT, rss is divided by 1024 for the "MB" figure)
      - |
        PID="$(lsof -t -i:{{.PORT}})"
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" \
        > >(tee -a openrefine.log)
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \
        > >(tee -a openrefine.log)
      # shut down OpenRefine and archive the OpenRefine project
      - task: :stop
        vars:
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
    sources:
      - input/**
      - config/**
    generates:
      - openrefine.log
      - ./{{.PROJECT}}.openrefine.tar.gz
      - output/**
    # workaround to avoid an orphaned Java process on error
    # https://github.com/go-task/task/issues/141
    ignore_error: true

  # enable standalone execution (running `task` in project directory):
  # builds "<current directory name>:main", changes to the parent directory
  # and runs that task from there
  default:
    cmds:
      - PROJECT="${PWD##*/}:main" && cd .. && task "$PROJECT"