diff --git a/.github/workflows/example.yml b/.github/workflows/default.yml similarity index 67% rename from .github/workflows/example.yml rename to .github/workflows/default.yml index 6fca952..b826963 100644 --- a/.github/workflows/example.yml +++ b/.github/workflows/default.yml @@ -1,4 +1,4 @@ -name: example +name: default on: workflow_dispatch: # allows you to run this workflow manually from the Actions tab @@ -14,23 +14,8 @@ jobs: sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz - name: install OpenRefine and openrefine-client run: task install - - name: start OpenRefine - run: task start - - name: import - run: task import - - name: transform - run: task transform - - name: export - run: task export - - name: print stats - if: always() - run: task stats - - name: check log file - if: always() - run: task check - - name: stop OpenRefine - if: always() - run: task stop + - name: run OpenRefine batch processing + run: task default - uses: actions/upload-artifact@v2 if: always() with: diff --git a/README.md b/README.md index 8e986a5..c0ecc7f 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,10 @@ # OpenRefine Task Runner (💎+🤖) -[![Codacy Badge](https://app.codacy.com/project/badge/Grade/888dbf663fdd409e8d8fcf8472114194)](https://www.codacy.com/gh/opencultureconsulting/openrefine-task-runner/dashboard) [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefineder/master) +[![Codacy Badge](https://app.codacy.com/project/badge/Grade/888dbf663fdd409e8d8fcf8472114194)](https://www.codacy.com/gh/opencultureconsulting/openrefine-task-runner/dashboard) [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefine-task-runner/main) -Templates for OpenRefine batch processing (import, transform, export) using the task runner [go-task](https://github.com/go-task/task) and the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) to control OpenRefine via [its HTTP 
API](https://docs.openrefine.org/technical-reference/openrefine-api). +Templates for OpenRefine batch processing (import, transform, export) using the task runner [go-task](https://github.com/go-task/task) and the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) to control OpenRefine via [its HTTP API](https://docs.openrefine.org/technical-reference/openrefine-api). + +The workflow is defined in [Taskfile.yml](Taskfile.yml) and can be executed either locally (`task default`) or with [GitHub Actions](.github/workflows/default.yml). ## Features @@ -17,6 +19,11 @@ Templates for OpenRefine batch processing (import, transform, export) using the * works with OpenRefine 2.7, 2.8, 3.0, 3.1, 3.2, 3.3, 3.4 and 3.5 * tasks are easy to extend with additional commands (e.g. to download input data or validate results) +## Requirements + +* GNU/Linux (tested with Fedora 34) +* Java 8+ (for OpenRefine) + ## Typical workflow **Step 1**: Do some experiments with your data (or parts of it) in the graphical user interface of OpenRefine. If you are fine with all transformation rules, [extract the json code](http://kb.refinepro.com/2012/06/google-refine-json-and-my-notepad-or.html) and save it as file (e.g. dedup.json). @@ -35,14 +42,9 @@ Templates for OpenRefine batch processing (import, transform, export) using the * Metadata experts can use OpenRefine's graphical interface and IT staff can incorporate the created transformation rules into regular data processing flows. 
-## Requirements - -* GNU/Linux (tested with Fedora 34) -* JAVA 8+ (for OpenRefine) - ## Demo via binder -[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefineder/master) +[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefine-task-runner/main) - free to use on-demand server with Jupyterlab and Bash Kernel - OpenRefine, openrefine-client and go-task [preinstalled](binder/postBuild) @@ -82,7 +84,7 @@ Templates for OpenRefine batch processing (import, transform, export) using the ## Usage -* Run default task (start, import, transform, export, stats, check, kill and cleanup) +* Run workflow ```sh task default diff --git a/Taskfile.yml b/Taskfile.yml index e34731c..0084232 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -8,26 +8,57 @@ env: OPENREFINE_APPDIR: sh: readlink -m .openrefine OPENREFINE_TMPDIR: - sh: mktemp -d + sh: mkdir -p .openrefine/tmp; readlink -m .openrefine/tmp tasks: default: - desc: run tasks start, import, transform, export, stats, check, kill and cleanup + desc: run workflow in batch mode cmds: - - defer: { task: cleanup } # will run even when one of the following commands fail + - defer: { task: stop } # will always be executed last - task: start - - defer: { task: kill } # will run before cleanup - - defer: { task: check } # will run before kill - - defer: { task: stats } # will run before check - - task: import - - task: transform - - task: export + - task: example sources: - Taskfile.yml - input/** - config/** generates: - output/** + preconditions: + - sh: test -f "${OPENREFINE_APPDIR}/refine" + msg: "OpenRefine missing; try task install" + + start: + - echo "start OpenRefine with max. $OPENREFINE_MEMORY on port $OPENREFINE_PORT..." 
+ - | # launch OpenRefine with specific data directory and redirect its output to a log file + "${OPENREFINE_APPDIR}/refine" -v warn -p "$OPENREFINE_PORT" -m "$OPENREFINE_MEMORY" -d "${OPENREFINE_TMPDIR}" > "${OPENREFINE_TMPDIR}/log.txt" 2>&1 & + - | # wait until OpenRefine API is available + timeout 30s bash -c "until wget -q -O - -o /dev/null http://localhost:${OPENREFINE_PORT} | cat | grep -q -o OpenRefine; do sleep 1; done" + + example: + - | # import (requires absolute path) + "${OPENREFINE_APPDIR}/client" \ + --create "$(readlink -m input/duplicates.csv)" \ + --projectName example + - | # apply undo/redo history + for f in config/*.json; do + "${OPENREFINE_APPDIR}/client" example --apply "$f" + done + - | # export to TSV + mkdir -p output + "${OPENREFINE_APPDIR}/client" example \ + --output output/deduped.tsv + + stop: + - defer: rm -rf "${OPENREFINE_TMPDIR}" + - | # print stats and kill OpenRefine immediately + PID="$(lsof -t -i:${OPENREFINE_PORT})" + echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" + echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" + kill -9 $PID + - | # check log file for any warnings + if grep -i 'exception\|error' "${OPENREFINE_TMPDIR}/log.txt" + then echo 1>&2 "log contains warnings!"; echo; cat "${OPENREFINE_TMPDIR}/log.txt"; exit 1 + fi install: desc: install OpenRefine and openrefine-client into subdirectory ${OPENREFINE_APPDIR} @@ -44,63 +75,6 @@ tasks: wget --no-verbose -O "${OPENREFINE_APPDIR}/client" https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux chmod +x "${OPENREFINE_APPDIR}/client" - start: - - | # requirement OpenRefine - if [ ! 
-f "${OPENREFINE_APPDIR}/refine" ]; then - echo 1>&2 "OpenRefine missing; try task install"; exit 1 - fi - - | # launch OpenRefine with specific data directory and redirect its output to a log file - "${OPENREFINE_APPDIR}/refine" -v warn -p "$OPENREFINE_PORT" -m "$OPENREFINE_MEMORY" -d "${OPENREFINE_TMPDIR}" >> "${OPENREFINE_TMPDIR}/log.txt" 2>&1 & - - | # wait until OpenRefine API is available - timeout 30s bash -c "until wget -q -O - -o /dev/null http://localhost:${OPENREFINE_PORT} | cat | grep -q -o OpenRefine; do sleep 1; done" - - import: - - | # import (requires absolute path) - "${OPENREFINE_APPDIR}/client" \ - --create "$(readlink -m input/duplicates.csv)" \ - --projectName myproject \ - > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1 - - transform: - - | # apply undo/redo history - for f in config/*.json; do - "${OPENREFINE_APPDIR}/client" myproject --apply "$f" \ - > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1 - done - - export: - - mkdir -p output - - | # export to TSV - "${OPENREFINE_APPDIR}/client" myproject \ - --output "$(readlink -m output/deduped.tsv)" \ - > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1 - - stats: - - | # print RAM and CPU usage - PID="$(lsof -t -i:${OPENREFINE_PORT})" - echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" \ - > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1 - echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \ - > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1 - - check: - - | # check log file for any warnings - if grep -i 'exception\|error' "${OPENREFINE_TMPDIR}/log.txt" - then echo 1>&2 "log contains warnings!"; echo; cat "${OPENREFINE_TMPDIR}/log.txt"; exit 1 - fi - - stop: - - | # shut down OpenRefine gracefully - PID="$(lsof -t -i:${OPENREFINE_PORT})" - kill $PID; while ps -p $PID > /dev/null; do sleep 1; done - - kill: - - | # shut down OpenRefine immediately to save time - PID="$(lsof -t -i:${OPENREFINE_PORT})" - kill -9 $PID - - cleanup: rm -rf "${OPENREFINE_TMPDIR}" - git: desc: 
commit and push if something changed cmds: