simplify even more
This commit is contained in:
parent
cfb37d72e6
commit
c0facb81e0
|
@ -1,4 +1,4 @@
|
|||
name: example
|
||||
name: default
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows you to run this workflow manually from the Actions tab
|
||||
|
@ -14,23 +14,8 @@ jobs:
|
|||
sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz
|
||||
- name: install OpenRefine and openrefine-client
|
||||
run: task install
|
||||
- name: start OpenRefine
|
||||
run: task start
|
||||
- name: import
|
||||
run: task import
|
||||
- name: transform
|
||||
run: task transform
|
||||
- name: export
|
||||
run: task export
|
||||
- name: print stats
|
||||
if: always()
|
||||
run: task stats
|
||||
- name: check log file
|
||||
if: always()
|
||||
run: task check
|
||||
- name: stop OpenRefine
|
||||
if: always()
|
||||
run: task stop
|
||||
- name: run OpenRefine batch processing
|
||||
run: task default
|
||||
- uses: actions/upload-artifact@v2
|
||||
if: always()
|
||||
with:
|
18
README.md
18
README.md
|
@ -1,9 +1,11 @@
|
|||
# OpenRefine Task Runner (💎+🤖)
|
||||
|
||||
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/888dbf663fdd409e8d8fcf8472114194)](https://www.codacy.com/gh/opencultureconsulting/openrefine-task-runner/dashboard) [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefineder/master)
|
||||
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/888dbf663fdd409e8d8fcf8472114194)](https://www.codacy.com/gh/opencultureconsulting/openrefine-task-runner/dashboard) [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefine-task-runner/main)
|
||||
|
||||
Templates for OpenRefine batch processing (import, transform, export) using the task runner [go-task](https://github.com/go-task/task) and the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) to control OpenRefine via [its HTTP API](https://docs.openrefine.org/technical-reference/openrefine-api).
|
||||
|
||||
The workflow is defined in [Taskfile.yml](Taskfile.yml) and can be executed either locally (`task default`) or with [GitHub Actions](.github/workflows/default.yml).
|
||||
|
||||
## Features
|
||||
|
||||
* basic error handling by monitoring the OpenRefine server log
|
||||
|
@ -17,6 +19,11 @@ Templates for OpenRefine batch processing (import, transform, export) using the
|
|||
* works with OpenRefine 2.7, 2.8, 3.0, 3.1, 3.2, 3.3, 3.4 and 3.5
|
||||
* tasks are easy to extend with additional commands (e.g. to download input data or validate results)
|
||||
|
||||
## Requirements
|
||||
|
||||
* GNU/Linux (tested with Fedora 34)
|
||||
* Java 8+ (for OpenRefine)
|
||||
|
||||
## Typical workflow
|
||||
|
||||
**Step 1**: Do some experiments with your data (or parts of it) in the graphical user interface of OpenRefine. Once you are satisfied with all transformation rules, [extract the JSON code](http://kb.refinepro.com/2012/06/google-refine-json-and-my-notepad-or.html) and save it as a file (e.g. dedup.json).
|
||||
|
@ -35,14 +42,9 @@ Templates for OpenRefine batch processing (import, transform, export) using the
|
|||
|
||||
* Metadata experts can use OpenRefine's graphical interface and IT staff can incorporate the created transformation rules into regular data processing flows.
|
||||
|
||||
## Requirements
|
||||
|
||||
* GNU/Linux (tested with Fedora 34)
|
||||
* Java 8+ (for OpenRefine)
|
||||
|
||||
## Demo via binder
|
||||
|
||||
[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefineder/master)
|
||||
[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefine-task-runner/main)
|
||||
|
||||
- free-to-use, on-demand server with JupyterLab and a Bash kernel
|
||||
- OpenRefine, openrefine-client and go-task [preinstalled](binder/postBuild)
|
||||
|
@ -82,7 +84,7 @@ Templates for OpenRefine batch processing (import, transform, export) using the
|
|||
|
||||
## Usage
|
||||
|
||||
* Run default task (start, import, transform, export, stats, check, kill and cleanup)
|
||||
* Run workflow
|
||||
|
||||
```sh
|
||||
task default
|
||||
|
|
106
Taskfile.yml
106
Taskfile.yml
|
@ -8,26 +8,57 @@ env:
|
|||
OPENREFINE_APPDIR:
|
||||
sh: readlink -m .openrefine
|
||||
OPENREFINE_TMPDIR:
|
||||
sh: mktemp -d
|
||||
sh: mkdir -p .openrefine/tmp; readlink -m .openrefine/tmp
|
||||
|
||||
tasks:
|
||||
default:
|
||||
desc: run tasks start, import, transform, export, stats, check, kill and cleanup
|
||||
desc: run workflow in batch mode
|
||||
cmds:
|
||||
- defer: { task: cleanup } # will run even when one of the following commands fail
|
||||
- defer: { task: stop } # will always be executed last
|
||||
- task: start
|
||||
- defer: { task: kill } # will run before cleanup
|
||||
- defer: { task: check } # will run before kill
|
||||
- defer: { task: stats } # will run before check
|
||||
- task: import
|
||||
- task: transform
|
||||
- task: export
|
||||
- task: example
|
||||
sources:
|
||||
- Taskfile.yml
|
||||
- input/**
|
||||
- config/**
|
||||
generates:
|
||||
- output/**
|
||||
preconditions:
|
||||
- sh: test -f "${OPENREFINE_APPDIR}/refine"
|
||||
msg: "OpenRefine missing; try task install"
|
||||
|
||||
start:
|
||||
- echo "start OpenRefine with max. $OPENREFINE_MEMORY on port $OPENREFINE_PORT..."
|
||||
- | # launch OpenRefine with specific data directory and redirect its output to a log file
|
||||
"${OPENREFINE_APPDIR}/refine" -v warn -p "$OPENREFINE_PORT" -m "$OPENREFINE_MEMORY" -d "${OPENREFINE_TMPDIR}" > "${OPENREFINE_TMPDIR}/log.txt" 2>&1 &
|
||||
- | # wait until OpenRefine API is available
|
||||
timeout 30s bash -c "until wget -q -O - -o /dev/null http://localhost:${OPENREFINE_PORT} | cat | grep -q -o OpenRefine; do sleep 1; done"
|
||||
|
||||
example:
|
||||
- | # import (requires absolute path)
|
||||
"${OPENREFINE_APPDIR}/client" \
|
||||
--create "$(readlink -m input/duplicates.csv)" \
|
||||
--projectName example
|
||||
- | # apply undo/redo history
|
||||
for f in config/*.json; do
|
||||
"${OPENREFINE_APPDIR}/client" example --apply "$f"
|
||||
done
|
||||
- | # export to TSV
|
||||
mkdir -p output
|
||||
"${OPENREFINE_APPDIR}/client" example \
|
||||
--output output/deduped.tsv
|
||||
|
||||
stop:
|
||||
- defer: rm -rf "${OPENREFINE_TMPDIR}"
|
||||
- | # print stats and kill OpenRefine immediately
|
||||
PID="$(lsof -t -i:${OPENREFINE_PORT})"
|
||||
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM"
|
||||
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time"
|
||||
kill -9 $PID
|
||||
- | # check log file for any warnings
|
||||
if grep -i 'exception\|error' "${OPENREFINE_TMPDIR}/log.txt"
|
||||
then echo 1>&2 "log contains warnings!"; echo; cat "${OPENREFINE_TMPDIR}/log.txt"; exit 1
|
||||
fi
|
||||
|
||||
install:
|
||||
desc: install OpenRefine and openrefine-client into subdirectory ${OPENREFINE_APPDIR}
|
||||
|
@ -44,63 +75,6 @@ tasks:
|
|||
wget --no-verbose -O "${OPENREFINE_APPDIR}/client" https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
|
||||
chmod +x "${OPENREFINE_APPDIR}/client"
|
||||
|
||||
start:
|
||||
- | # requirement OpenRefine
|
||||
if [ ! -f "${OPENREFINE_APPDIR}/refine" ]; then
|
||||
echo 1>&2 "OpenRefine missing; try task install"; exit 1
|
||||
fi
|
||||
- | # launch OpenRefine with specific data directory and redirect its output to a log file
|
||||
"${OPENREFINE_APPDIR}/refine" -v warn -p "$OPENREFINE_PORT" -m "$OPENREFINE_MEMORY" -d "${OPENREFINE_TMPDIR}" >> "${OPENREFINE_TMPDIR}/log.txt" 2>&1 &
|
||||
- | # wait until OpenRefine API is available
|
||||
timeout 30s bash -c "until wget -q -O - -o /dev/null http://localhost:${OPENREFINE_PORT} | cat | grep -q -o OpenRefine; do sleep 1; done"
|
||||
|
||||
import:
|
||||
- | # import (requires absolute path)
|
||||
"${OPENREFINE_APPDIR}/client" \
|
||||
--create "$(readlink -m input/duplicates.csv)" \
|
||||
--projectName myproject \
|
||||
> >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
|
||||
|
||||
transform:
|
||||
- | # apply undo/redo history
|
||||
for f in config/*.json; do
|
||||
"${OPENREFINE_APPDIR}/client" myproject --apply "$f" \
|
||||
> >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
|
||||
done
|
||||
|
||||
export:
|
||||
- mkdir -p output
|
||||
- | # export to TSV
|
||||
"${OPENREFINE_APPDIR}/client" myproject \
|
||||
--output "$(readlink -m output/deduped.tsv)" \
|
||||
> >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
|
||||
|
||||
stats:
|
||||
- | # print RAM and CPU usage
|
||||
PID="$(lsof -t -i:${OPENREFINE_PORT})"
|
||||
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" \
|
||||
> >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
|
||||
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \
|
||||
> >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
|
||||
|
||||
check:
|
||||
- | # check log file for any warnings
|
||||
if grep -i 'exception\|error' "${OPENREFINE_TMPDIR}/log.txt"
|
||||
then echo 1>&2 "log contains warnings!"; echo; cat "${OPENREFINE_TMPDIR}/log.txt"; exit 1
|
||||
fi
|
||||
|
||||
stop:
|
||||
- | # shut down OpenRefine gracefully
|
||||
PID="$(lsof -t -i:${OPENREFINE_PORT})"
|
||||
kill $PID; while ps -p $PID > /dev/null; do sleep 1; done
|
||||
|
||||
kill:
|
||||
- | # shut down OpenRefine immediately to save time
|
||||
PID="$(lsof -t -i:${OPENREFINE_PORT})"
|
||||
kill -9 $PID
|
||||
|
||||
cleanup: rm -rf "${OPENREFINE_TMPDIR}"
|
||||
|
||||
git:
|
||||
desc: commit and push if something changed
|
||||
cmds:
|
||||
|
|
Loading…
Reference in New Issue