version: '3'

tasks:

  main:
    desc: Removing duplicates in a very small test dataset
    vars:
      DIR: '{{splitList ":" .TASK | first}}'
    cmds:
      - task: refine
      - task: :check # check OpenRefine log for any warnings and exit on error
        vars: {DIR: '{{.DIR}}'}

  refine:
    dir: ./{{.DIR}}
    vars:
      DIR: '{{splitList ":" .TASK | first}}'
      PROJECT: duplicates
      PORT: 3335 # assign a different port for each project
      RAM: 2048M # maximum RAM for OpenRefine java heap space
      LOG: '>(tee -a "log/{{.PROJECT}}.log") 2>&1'
    cmds:
      - task: :start # launch OpenRefine
        vars: {DIR: '{{.DIR}}/log', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
      - > # import file
        "$CLIENT" -P {{.PORT}}
        --create "$(readlink -m input/duplicates.csv)"
        --encoding UTF-8
        --projectName "{{.PROJECT}}"
        > {{.LOG}}
      - > # apply transformation rules
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/duplicates-deletion.json
        > {{.LOG}}
      - > # export to file
        mkdir -p output &&
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --output "$(readlink -m output/deduped.xls)"
        > {{.LOG}}
      - | # print allocated system resources
        PID="$(lsof -t -i:{{.PORT}})"
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
      - task: :stop # shut down OpenRefine and archive the OpenRefine project
        vars: {DIR: '{{.DIR}}/log', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
    sources:
      - Taskfile.yml
      - input/**
      - config/**
    generates:
      - log/{{.PROJECT}}.openrefine.tar.gz
      - output/**
    ignore_error: true # workaround to avoid an orphaned Java process on error
                       # https://github.com/go-task/task/issues/141

  default: # enable standalone execution (running `task` in project directory)
    cmds:
      - DIR="${PWD##*/}:main" && cd .. && task "$DIR"