---
# Taskfile schema version understood by go-task.
version: '3'
tasks:
  # Runs the refine workflow and then validates the OpenRefine log.
  main:
    desc: Removing duplicates in a very small test dataset
    vars:
      # results in the task namespace, which is identical to the directory name
      DIR: '{{splitList ":" .TASK | first}}'
    cmds:
      - task: refine
      # check OpenRefine log for any warnings and exit on error
      # (the leading ":" addresses a task defined in the root Taskfile)
      - task: ':check'
        vars:
          DIR: '{{.DIR}}'

  # Imports the CSV into OpenRefine, applies the deduplication rules,
  # exports the result and reports the resources the server used.
  refine:
    dir: './{{.DIR}}'
    vars:
      DIR: '{{splitList ":" .TASK | first}}'
      PROJECT: duplicates
      PORT: 3335  # assign a different port for each project
      RAM: 2048M  # maximum RAM for OpenRefine java heap space
      # be careful when making changes here, as the path to the log file
      # should match the server log (see main task "start")
      LOG: '>(tee -a "{{.PROJECT}}.log") 2>&1'
    cmds:
      # launch OpenRefine
      - task: ':start'
        vars:
          DIR: '{{.DIR}}'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
          RAM: '{{.RAM}}'
      # import file
      - >
        "$CLIENT" -P {{.PORT}}
        --create "$(readlink -m input/duplicates.csv)"
        --encoding UTF-8
        --projectName "{{.PROJECT}}"
        > {{.LOG}}
      # apply transformation rules
      - >
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/duplicates-deletion.json
        > {{.LOG}}
      # export to file
      - >
        mkdir -p output &&
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --output "$(readlink -m output/deduped.xls)"
        > {{.LOG}}
      # print allocated system resources of the process listening on PORT
      - |
        PID="$(lsof -t -i:{{.PORT}})"
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
      # shut down OpenRefine and archive the OpenRefine project
      - task: ':stop'
        vars:
          DIR: '{{.DIR}}'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
    sources:
      - Taskfile.yml
      - input/**
      - config/**
    generates:
      - './{{.PROJECT}}.openrefine.tar.gz'
      - output/**
    # workaround to avoid an orphaned Java process on error
    # https://github.com/go-task/task/issues/141
    ignore_error: true

  # enable standalone execution (running `task` in project directory)
  default:
    cmds:
      # derive "<directory name>:main", then invoke it from the parent directory
      - DIR="${PWD##*/}:main" && cd .. && task "$DIR"