simplify even more

This commit is contained in:
Felix Lohmeier 2022-04-06 20:43:30 +02:00
parent cfb37d72e6
commit c0facb81e0
3 changed files with 54 additions and 93 deletions

View File

@ -1,4 +1,4 @@
name: example
name: default
on:
workflow_dispatch: # allows you to run this workflow manually from the Actions tab
@ -14,23 +14,8 @@ jobs:
sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz
- name: install OpenRefine and openrefine-client
run: task install
- name: start OpenRefine
run: task start
- name: import
run: task import
- name: transform
run: task transform
- name: export
run: task export
- name: print stats
if: always()
run: task stats
- name: check log file
if: always()
run: task check
- name: stop OpenRefine
if: always()
run: task stop
- name: run OpenRefine batch processing
run: task default
- uses: actions/upload-artifact@v2
if: always()
with:

View File

@ -1,8 +1,10 @@
# OpenRefine Task Runner (💎+🤖)
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/888dbf663fdd409e8d8fcf8472114194)](https://www.codacy.com/gh/opencultureconsulting/openrefine-task-runner/dashboard) [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefineder/master)
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/888dbf663fdd409e8d8fcf8472114194)](https://www.codacy.com/gh/opencultureconsulting/openrefine-task-runner/dashboard) [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefine-task-runner/main)
Templates for OpenRefine batch processing (import, transform, export) using the task runner [go-task](https://github.com/go-task/task) and the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) to control OpenRefine via [its HTTP API](https://docs.openrefine.org/technical-reference/openrefine-api).
Templates for OpenRefine batch processing (import, transform, export) using the task runner [go-task](https://github.com/go-task/task) and the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) to control OpenRefine via [its HTTP API](https://docs.openrefine.org/technical-reference/openrefine-api).
The workflow is defined in [Taskfile.yml](Taskfile.yml) and can be executed either locally (`task default`) or with [GitHub Actions](.github/workflows/default.yml).
## Features
@ -17,6 +19,11 @@ Templates for OpenRefine batch processing (import, transform, export) using the
* works with OpenRefine 2.7, 2.8, 3.0, 3.1, 3.2, 3.3, 3.4 and 3.5
* tasks are easy to extend with additional commands (e.g. to download input data or validate results)
## Requirements
* GNU/Linux (tested with Fedora 34)
* Java 8+ (for OpenRefine)
## Typical workflow
**Step 1**: Do some experiments with your data (or parts of it) in the graphical user interface of OpenRefine. Once you are satisfied with the transformation rules, [extract the json code](http://kb.refinepro.com/2012/06/google-refine-json-and-my-notepad-or.html) and save it as a file (e.g. dedup.json).
@ -35,14 +42,9 @@ Templates for OpenRefine batch processing (import, transform, export) using the
* Metadata experts can use OpenRefine's graphical interface and IT staff can incorporate the created transformation rules into regular data processing flows.
## Requirements
* GNU/Linux (tested with Fedora 34)
* JAVA 8+ (for OpenRefine)
## Demo via binder
[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefineder/master)
[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefine-task-runner/main)
- free-to-use, on-demand server with JupyterLab and Bash kernel
- OpenRefine, openrefine-client and go-task [preinstalled](binder/postBuild)
@ -82,7 +84,7 @@ Templates for OpenRefine batch processing (import, transform, export) using the
## Usage
* Run default task (start, import, transform, export, stats, check, kill and cleanup)
* Run workflow
```sh
task default

View File

@ -8,26 +8,57 @@ env:
OPENREFINE_APPDIR:
sh: readlink -m .openrefine
OPENREFINE_TMPDIR:
sh: mktemp -d
sh: mkdir -p .openrefine/tmp; readlink -m .openrefine/tmp
tasks:
default:
desc: run tasks start, import, transform, export, stats, check, kill and cleanup
desc: run workflow in batch mode
cmds:
- defer: { task: cleanup } # will run even when one of the following commands fail
- defer: { task: stop } # will always be executed last
- task: start
- defer: { task: kill } # will run before cleanup
- defer: { task: check } # will run before kill
- defer: { task: stats } # will run before check
- task: import
- task: transform
- task: export
- task: example
sources:
- Taskfile.yml
- input/**
- config/**
generates:
- output/**
preconditions:
- sh: test -f "${OPENREFINE_APPDIR}/refine"
msg: "OpenRefine missing; try task install"
# start: launch OpenRefine in the background and block until its HTTP endpoint answers.
#   1. prints the configured memory limit ($OPENREFINE_MEMORY) and port ($OPENREFINE_PORT)
#   2. runs "${OPENREFINE_APPDIR}/refine" with log level "warn", using ${OPENREFINE_TMPDIR}
#      as data directory; stdout and stderr are written to ${OPENREFINE_TMPDIR}/log.txt
#      (">" truncates an existing log file) and the process is backgrounded with "&"
#   3. polls http://localhost:${OPENREFINE_PORT} once per second until the response
#      contains the string "OpenRefine"; "timeout 30s" aborts the wait after 30 seconds
start:
- echo "start OpenRefine with max. $OPENREFINE_MEMORY on port $OPENREFINE_PORT..."
- | # launch OpenRefine with specific data directory and redirect its output to a log file
"${OPENREFINE_APPDIR}/refine" -v warn -p "$OPENREFINE_PORT" -m "$OPENREFINE_MEMORY" -d "${OPENREFINE_TMPDIR}" > "${OPENREFINE_TMPDIR}/log.txt" 2>&1 &
- | # wait until OpenRefine API is available
timeout 30s bash -c "until wget -q -O - -o /dev/null http://localhost:${OPENREFINE_PORT} | cat | grep -q -o OpenRefine; do sleep 1; done"
# example: sample import -> transform -> export run against the project named "example".
#   1. creates the project from input/duplicates.csv; the path is made absolute with
#      "readlink -m" because the import requires an absolute path
#   2. calls the client with --apply once for every file matching config/*.json
#      NOTE(review): if no file matches, the unexpanded pattern is presumably passed
#      to the client as-is and the command fails -- confirm this is the intended behavior
#   3. creates the output directory if needed and exports the project to output/deduped.tsv
example:
- | # import (requires absolute path)
"${OPENREFINE_APPDIR}/client" \
--create "$(readlink -m input/duplicates.csv)" \
--projectName example
- | # apply undo/redo history
for f in config/*.json; do
"${OPENREFINE_APPDIR}/client" example --apply "$f"
done
- | # export to TSV
mkdir -p output
"${OPENREFINE_APPDIR}/client" example \
--output output/deduped.tsv
stop:
  # Print RAM/CPU usage of the OpenRefine server, kill it, then fail the task
  # if the server log contains an exception or error.
  # The deferred command removes ${OPENREFINE_TMPDIR} when the task ends,
  # even if one of the commands below fails.
  - defer: rm -rf "${OPENREFINE_TMPDIR}"
  - | # print stats and kill OpenRefine immediately
    # Select only the process LISTENING on the port. A plain "-i:PORT" also
    # matches clients that hold a connection to the port, so PID could contain
    # several ids and break the ps/kill calls below.
    PID="$(lsof -t -iTCP:"${OPENREFINE_PORT}" -sTCP:LISTEN | head -n 1)"
    if [ -z "$PID" ]; then
      # Nothing to measure or kill (e.g. OpenRefine crashed during startup).
      # Do not abort here, so that the log check below still runs and can
      # report the cause.
      echo 1>&2 "OpenRefine is not running on port ${OPENREFINE_PORT}"
    else
      echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM"
      echo "used $(ps --no-headers -o cputime -p "$PID") CPU time"
      kill -9 "$PID"
    fi
  - | # check log file for any warnings
    if grep -i 'exception\|error' "${OPENREFINE_TMPDIR}/log.txt"
    then echo 1>&2 "log contains warnings!"; echo; cat "${OPENREFINE_TMPDIR}/log.txt"; exit 1
    fi
install:
desc: install OpenRefine and openrefine-client into subdirectory ${OPENREFINE_APPDIR}
@ -44,63 +75,6 @@ tasks:
wget --no-verbose -O "${OPENREFINE_APPDIR}/client" https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
chmod +x "${OPENREFINE_APPDIR}/client"
start:
- | # requirement OpenRefine
if [ ! -f "${OPENREFINE_APPDIR}/refine" ]; then
echo 1>&2 "OpenRefine missing; try task install"; exit 1
fi
- | # launch OpenRefine with specific data directory and redirect its output to a log file
"${OPENREFINE_APPDIR}/refine" -v warn -p "$OPENREFINE_PORT" -m "$OPENREFINE_MEMORY" -d "${OPENREFINE_TMPDIR}" >> "${OPENREFINE_TMPDIR}/log.txt" 2>&1 &
- | # wait until OpenRefine API is available
timeout 30s bash -c "until wget -q -O - -o /dev/null http://localhost:${OPENREFINE_PORT} | cat | grep -q -o OpenRefine; do sleep 1; done"
import:
- | # import (requires absolute path)
"${OPENREFINE_APPDIR}/client" \
--create "$(readlink -m input/duplicates.csv)" \
--projectName myproject \
> >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
transform:
- | # apply undo/redo history
for f in config/*.json; do
"${OPENREFINE_APPDIR}/client" myproject --apply "$f" \
> >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
done
export:
- mkdir -p output
- | # export to TSV
"${OPENREFINE_APPDIR}/client" myproject \
--output "$(readlink -m output/deduped.tsv)" \
> >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
stats:
- | # print RAM and CPU usage
PID="$(lsof -t -i:${OPENREFINE_PORT})"
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" \
> >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \
> >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
check:
- | # check log file for any warnings
if grep -i 'exception\|error' "${OPENREFINE_TMPDIR}/log.txt"
then echo 1>&2 "log contains warnings!"; echo; cat "${OPENREFINE_TMPDIR}/log.txt"; exit 1
fi
stop:
- | # shut down OpenRefine gracefully
PID="$(lsof -t -i:${OPENREFINE_PORT})"
kill $PID; while ps -p $PID > /dev/null; do sleep 1; done
kill:
- | # shut down OpenRefine immediately to save time
PID="$(lsof -t -i:${OPENREFINE_PORT})"
kill -9 $PID
cleanup: rm -rf "${OPENREFINE_TMPDIR}"
git:
desc: commit and push if something changed
cmds: