version: '3'

tasks:
  # Entry point of the project: runs the refine task of this Taskfile and then
  # the shared :check task for the same project.
  main:
    desc: Removing duplicates in a very small test dataset
    cmds:
      - task: refine
      # check OpenRefine log for any warnings and exit on error
      - task: :check
        vars:
          # project name = part of the task name before the first ":"
          PROJECT: '{{splitList ":" .TASK | first}}'

  # Complete OpenRefine run for one project: start the server, import the
  # input file, apply the transformation rules, export the result, log the
  # resource usage and stop the server again.
  refine:
    dir: ./{{.PROJECT}}
    vars:
      PORT: 3334 # assign a different port for each project
      RAM: 2048M # maximum RAM for OpenRefine java heap space
      # project name = part of the task name before the first ":"
      PROJECT: '{{splitList ":" .TASK | first}}'
    cmds:
      - task: :start # launch OpenRefine
        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
      # Each client call below sends stdout and stderr through tee, so the
      # output is shown on the console and appended to openrefine.log.
      # The project name is quoted so that it always reaches the client as a
      # single shell word.
      - > # import file
        ../openrefine/client -P {{.PORT}}
        --create "$(readlink -m input/duplicates.csv)"
        --encoding UTF-8
        --projectName "{{.PROJECT}}"
        > >(tee -a openrefine.log) 2>&1
      - > # apply transformation rules
        ../openrefine/client -P {{.PORT}} "{{.PROJECT}}"
        --apply config/duplicates-deletion.json
        > >(tee -a openrefine.log) 2>&1
      - > # export to file
        mkdir -p output &&
        ../openrefine/client -P {{.PORT}} "{{.PROJECT}}"
        --output "$(readlink -m output/deduped.xls)"
        > >(tee -a openrefine.log) 2>&1
      # -sTCP:LISTEN restricts lsof to the process listening on the port.
      # Without it, any other process holding a connection to the port (e.g. a
      # browser showing the OpenRefine UI) is reported as an additional PID,
      # and the arithmetic below expects exactly one rss value.
      - | # print allocated system resources
        PID="$(lsof -t -i:{{.PORT}} -sTCP:LISTEN)"
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" \
          > >(tee -a openrefine.log)
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \
          > >(tee -a openrefine.log)
      - task: :stop # shut down OpenRefine and archive the OpenRefine project
        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
    # sources/generates are resolved relative to `dir` (the project directory)
    sources:
      - input/**
      - config/**
    generates:
      - openrefine.log
      - ./{{.PROJECT}}.openrefine.tar.gz
      - output/**
    ignore_error: true # workaround to avoid an orphaned Java process on error
                       # https://github.com/go-task/task/issues/141

  # Enable standalone execution: running plain `task` inside the project
  # directory changes to the parent directory and runs "<directory name>:main"
  # from there.
  default:
    cmds:
      - 'PROJECT="${PWD##*/}:main" && cd .. && task "$PROJECT"'