reduce complexity

This commit is contained in:
Felix Lohmeier 2022-04-06 13:30:59 +02:00
parent 1341e1b45c
commit 8ee91ee84f
11 changed files with 147 additions and 404 deletions

View File

@ -1,86 +0,0 @@
# Manually triggered workflow with one job per example project.
# The job id is reused everywhere a project has to be named: the shell reads it
# from $GITHUB_JOB (task namespace), expressions read it from ${{ github.job }}
# (step title, artifact name and artifact path).
name: run all tasks
on:
  workflow_dispatch: # allows you to run this workflow manually from the Actions tab
jobs:
  example-doaj:
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v2
      - name: install go-task 3.10.0
        run: |
          wget --no-verbose -O task.tar.gz https://github.com/go-task/task/releases/download/v3.10.0/task_linux_amd64.tar.gz
          sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz
      - name: install OpenRefine and openrefine-client
        run: task install
      - name: run task ${{ github.job }}
        run: task "$GITHUB_JOB:main"
      # upload the project directory even when the task above failed
      - if: always()
        uses: actions/upload-artifact@v2
        with:
          name: ${{ github.job }}
          path: ${{ github.job }}
          retention-days: 5
      # has nothing to do currently because of .gitignore
      - name: commit and push if output changed
        run: |-
          git config user.name "Automated"
          git config user.email "actions@users.noreply.github.com"
          git add -A
          git status
          git commit -m "latest change: $(date -u)" || exit 0
          git push

  example-duplicates:
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v2
      - name: install go-task 3.10.0
        run: |
          wget --no-verbose -O task.tar.gz https://github.com/go-task/task/releases/download/v3.10.0/task_linux_amd64.tar.gz
          sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz
      - name: install OpenRefine and openrefine-client
        run: task install
      - name: run task ${{ github.job }}
        run: task "$GITHUB_JOB:main"
      # upload the project directory even when the task above failed
      - if: always()
        uses: actions/upload-artifact@v2
        with:
          name: ${{ github.job }}
          path: ${{ github.job }}
          retention-days: 5
      # has nothing to do currently because of .gitignore
      - name: commit and push if output changed
        run: |-
          git config user.name "Automated"
          git config user.email "actions@users.noreply.github.com"
          git add -A
          git status
          git commit -m "latest change: $(date -u)" || exit 0
          git push

  example-powerhouse:
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v2
      - name: install go-task 3.10.0
        run: |
          wget --no-verbose -O task.tar.gz https://github.com/go-task/task/releases/download/v3.10.0/task_linux_amd64.tar.gz
          sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz
      - name: install OpenRefine and openrefine-client
        run: task install
      - name: run task ${{ github.job }}
        run: task "$GITHUB_JOB:main"
      # upload the project directory even when the task above failed
      - if: always()
        uses: actions/upload-artifact@v2
        with:
          name: ${{ github.job }}
          path: ${{ github.job }}
          retention-days: 5
      # has nothing to do currently because of .gitignore
      - name: commit and push if output changed
        run: |-
          git config user.name "Automated"
          git config user.email "actions@users.noreply.github.com"
          git add -A
          git status
          git commit -m "latest change: $(date -u)" || exit 0
          git push

44
.github/workflows/example.yml vendored Normal file
View File

@ -0,0 +1,44 @@
# Manually triggered workflow: installs the tooling, then runs the Taskfile
# stages (start, import, transform, export, stats, check, stop) as separate
# steps and uploads the OpenRefine workspace as an artifact.
name: example
on:
  workflow_dispatch: # allows you to run this workflow manually from the Actions tab
jobs:
  main:
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v2
      - name: install go-task 3.10.0
        run: |
          wget --no-verbose -O task.tar.gz https://github.com/go-task/task/releases/download/v3.10.0/task_linux_amd64.tar.gz
          sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz
      - name: install OpenRefine and openrefine-client
        run: task install
      # Each `task ...` step below is a new process. The Taskfile fills
      # OPENREFINE_TMPDIR from `mktemp -d`, so without a fixed value every step
      # would get a different directory: server log, client logs and the
      # `check` step would not share one log.txt, and the artifact path
      # .openrefine/data would never be populated. Exporting the variable via
      # GITHUB_ENV pins one workspace for all later steps.
      # NOTE(review): relies on go-task giving an already-set environment
      # variable precedence over the Taskfile's `env:` entry (the same mechanism
      # the README uses to override OPENREFINE_MEMORY / OPENREFINE_PORT) - confirm.
      - name: use one OpenRefine workspace for all steps
        run: |
          mkdir -p "$GITHUB_WORKSPACE/.openrefine/data"
          echo "OPENREFINE_TMPDIR=$GITHUB_WORKSPACE/.openrefine/data" >> "$GITHUB_ENV"
      - name: start OpenRefine
        run: task start
      - name: import
        run: task import
      - name: transform
        run: task transform
      - name: export
        run: task export
      # the three steps below run even if an earlier step failed
      - name: print stats
        if: always()
        run: task stats
      - name: check log file
        if: always()
        run: task check
      - name: stop OpenRefine
        if: always()
        run: task stop
      - uses: actions/upload-artifact@v2
        if: always()
        with:
          name: OpenRefine project and logfile
          path: .openrefine/data
          retention-days: 7
      - name: git commit and push
        run: |
          git config user.name "Automated"
          git config user.email "actions@users.noreply.github.com"
          task git

7
.gitignore vendored
View File

@ -1,9 +1,2 @@
.task .task
.openrefine .openrefine
*/output
*/*.log
*/*.openrefine.tar.gz
example-doaj/input
example-doaj/config
example-powerhouse/input
example-powerhouse/config

View File

@ -1,21 +1,20 @@
# OpenRefine Task Runner (💎+🤖) # OpenRefine Task Runner (💎+🤖)
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/888dbf663fdd409e8d8fcf8472114194)](https://www.codacy.com/gh/opencultureconsulting/openrefine-task-runner/dashboard) [![Binder](https://notebooks.gesis.org/binder/badge_logo.svg)](https://notebooks.gesis.org/binder/v2/gh/opencultureconsulting/openrefine-task-runner/main?urlpath=lab/tree/demo.ipynb) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/888dbf663fdd409e8d8fcf8472114194)](https://www.codacy.com/gh/opencultureconsulting/openrefine-task-runner/dashboard) [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefineder/master)
Templates for OpenRefine batch processing (import, transform, export) using the task runner [go-task](https://github.com/go-task/task) and the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) to control OpenRefine via [its HTTP API](https://docs.openrefine.org/technical-reference/openrefine-api). Templates for OpenRefine batch processing (import, transform, export) using the task runner [go-task](https://github.com/go-task/task) and the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) to control OpenRefine via [its HTTP API](https://docs.openrefine.org/technical-reference/openrefine-api).
## Features ## Features
* run tasks in parallel
* basic error handling by monitoring the OpenRefine server log * basic error handling by monitoring the OpenRefine server log
* dedicated OpenRefine instances for each task (your existing OpenRefine data will not be touched) * dedicated OpenRefine instance with temporary workspace (your existing OpenRefine data will not be touched)
* prevent unnecessary work by fingerprinting generated files and their sources * prevent unnecessary work by fingerprinting generated files and their sources
* the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) used here supports many core features of OpenRefine: * the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) used here supports many core features of OpenRefine:
* import CSV, TSV, line-based TXT, fixed-width TXT, JSON or XML (and specify input options) * import CSV, TSV, line-based TXT, fixed-width TXT, JSON or XML (and specify input options)
* apply [undo/redo history](https://docs.openrefine.org/manual/running/#reusing-operations) from given JSON file(s) * apply [undo/redo history](https://docs.openrefine.org/manual/running/#reusing-operations) from given JSON file(s)
* export to CSV, TSV, HTML, XLS, XLSX, ODS * export to CSV, TSV, HTML, XLS, XLSX, ODS
* [templating export](https://github.com/opencultureconsulting/openrefine-client#templating) to additional formats like JSON or XML * [templating export](https://github.com/opencultureconsulting/openrefine-client#templating) to additional formats like JSON or XML
* works with OpenRefine 2.7, 2.8, 3.0, 3.1, 3.2, 3.3, 3.4, 3.4.1 and 3.5 * works with OpenRefine 2.7, 2.8, 3.0, 3.1, 3.2, 3.3, 3.4 and 3.5
* tasks are easy to extend with additional commands (e.g. to download input data or validate results) * tasks are easy to extend with additional commands (e.g. to download input data or validate results)
## Typical workflow ## Typical workflow
@ -26,10 +25,10 @@ Templates for OpenRefine batch processing (import, transform, export) using the
**Possible automation benefits:** **Possible automation benefits:**
* When you receive updated data (in the same structure), you just need to drop the file and start the task like this: * When you receive updated data (in the same structure), you just need to drop the input file and start the task like this:
```sh ```sh
task example-doaj task
``` ```
* The entire data processing (including options during import) becomes reproducible. The task configuration file can also be used for documentation through source code comments. * The entire data processing (including options during import) becomes reproducible. The task configuration file can also be used for documentation through source code comments.
@ -38,18 +37,17 @@ Templates for OpenRefine batch processing (import, transform, export) using the
## Requirements ## Requirements
* GNU/Linux (tested with Fedora 32) * GNU/Linux (tested with Fedora 34)
* Java 8+ (for OpenRefine) * Java 8+ (for OpenRefine)
## Demo via binder ## Demo via binder
[![Binder](https://notebooks.gesis.org/binder/badge_logo.svg)](https://notebooks.gesis.org/binder/v2/gh/opencultureconsulting/openrefine-task-runner/main?urlpath=lab/tree/demo.ipynb) [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefineder/master)
- free to use on-demand server with Jupyterlab and Bash Kernel - free to use on-demand server with Jupyterlab and Bash Kernel
- OpenRefine, openrefine-client and go-task [preinstalled](binder/postBuild) - OpenRefine, openrefine-client and go-task [preinstalled](binder/postBuild)
- no registration needed, will start within a few minutes - no registration needed, will start within a few minutes
- [restricted](https://notebooks.gesis.org/faq/) to 4 GB RAM and server will be deleted after 10 minutes of inactivity - [restricted](https://mybinder.readthedocs.io/en/latest/about/about.html#how-much-memory-am-i-given-when-using-binder) to 2 GB RAM and server will be deleted after 10 minutes of inactivity
- service is provided by GESIS and is intended for use by social scientists
## Install ## Install
@ -60,7 +58,7 @@ Templates for OpenRefine batch processing (import, transform, export) using the
cd openrefine-task-runner cd openrefine-task-runner
``` ```
2. Install [Task 3.10.0](https://github.com/go-task/task/releases/tag/v3.10.0) 2. Install [Task 3.10.0](https://github.com/go-task/task/releases/tag/v3.10.0)+
a) RPM-based (Fedora, CentOS, SLES, etc.) a) RPM-based (Fedora, CentOS, SLES, etc.)
@ -84,34 +82,28 @@ Templates for OpenRefine batch processing (import, transform, export) using the
## Usage ## Usage
* Run all tasks in parallel * Run default task (start, import, transform, export, stats, check, kill and cleanup)
```sh ```sh
task task default
``` ```
* Run a specific task * Override settings with environment variables
```sh ```sh
task example-duplicates:main OPENREFINE_MEMORY=2000M OPENREFINE_PORT=3334 task default
```
* Run some tasks in parallel
```sh
task --parallel example-duplicates:main example-doaj:main
``` ```
* Force run a task even when the task is up-to-date * Force run a task even when the task is up-to-date
```sh ```sh
task example-duplicates:main --force task default --force
``` ```
* Dry-run in verbose mode for debugging * Dry-run in verbose mode for debugging
```sh ```sh
task example-duplicates:main --dry --verbose --force task default --dry --verbose --force
``` ```
* List available tasks * List available tasks
@ -120,17 +112,9 @@ Templates for OpenRefine batch processing (import, transform, export) using the
task --list task --list
``` ```
### How to develop your own tasks ### Examples
(first draft, will be elaborated later) * [noah-biejournals](https://github.com/opencultureconsulting/noah-biejournals): harvesting of the BieJournals journal server of Bielefeld University Library and transformation to METS/MODS for the noah.nrw portal
1. create a new folder
2. copy an example Taskfile.yml
3. provide input data in subdirectory input
4. provide OpenRefine transformation history files in subdirectory config
5. add commands to specific Taskfile (check openrefine-client help screen for available options: `openrefine/client --help`)
6. add project to general Taskfile
7. check memory load and increase RAM if needed
### Getting help ### Getting help

View File

@ -1,102 +1,109 @@
# https://github.com/opencultureconsulting/openrefine-task-runner
version: '3' version: '3'
includes:
example-doaj: example-doaj
example-duplicates: example-duplicates
example-powerhouse: example-powerhouse
# add the directory name of your project here
silent: true silent: true
output: prefixed
env: env:
OPENREFINE: OPENREFINE_MEMORY: 5120M
sh: readlink -m .openrefine/refine OPENREFINE_PORT: 3333
CLIENT: OPENREFINE_APPDIR:
sh: readlink -m .openrefine/client sh: readlink -m .openrefine
OPENREFINE_TMPDIR:
sh: mktemp -d
tasks: tasks:
default: default:
desc: execute all projects in parallel desc: run tasks start, import, transform, export, stats, check, kill and cleanup
deps:
- task: example-doaj:refine
- task: example-duplicates:refine
- task: example-powerhouse:refine
# add the directory name of your project here
cmds: cmds:
- task: check - defer: { task: cleanup } # will run even when one of the following commands fail
- task: start
- defer: { task: kill } # will run before cleanup
- defer: { task: check } # will run before kill
- defer: { task: stats } # will run before check
- task: import
- task: transform
- task: export
sources:
- Taskfile.yml
- input/**
- config/**
generates:
- output/**
install: install:
desc: (re)install OpenRefine and openrefine-client into subdirectory .openrefine desc: install OpenRefine and openrefine-client into subdirectory ${OPENREFINE_APPDIR}
cmds: cmds:
- | # delete existing install and recreate folder - mkdir -p "${OPENREFINE_APPDIR}"
rm -rf .openrefine - | # install OpenRefine into subdirectory ${OPENREFINE_APPDIR}
mkdir -p .openrefine wget --no-verbose -O openrefine.tar.gz https://github.com/OpenRefine/OpenRefine/releases/download/3.5.2/openrefine-linux-3.5.2.tar.gz
- > # download OpenRefine archive tar -xzf openrefine.tar.gz -C "${OPENREFINE_APPDIR}" --strip 1 && rm openrefine.tar.gz
wget --no-verbose -O openrefine.tar.gz
https://github.com/OpenRefine/OpenRefine/releases/download/3.5.2/openrefine-linux-3.5.2.tar.gz
- | # install OpenRefine into subdirectory .openrefine
tar -xzf openrefine.tar.gz -C .openrefine --strip 1
rm openrefine.tar.gz
- | # optimize OpenRefine for batch processing - | # optimize OpenRefine for batch processing
sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' ".openrefine/refine" # fix path issue in OpenRefine startup file sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "${OPENREFINE_APPDIR}/refine" # fix path issue in OpenRefine startup file
sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' ".openrefine/refine.ini" # do not try to open OpenRefine in browser sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "${OPENREFINE_APPDIR}/refine.ini" # do not try to open OpenRefine in browser
sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' ".openrefine/refine.ini" # set autosave period from 5 minutes to 25 hours sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "${OPENREFINE_APPDIR}/refine.ini" # set autosave period from 5 minutes to 25 hours
- > # download openrefine-client into subdirectory .openrefine - | # install openrefine-client into subdirectory ${OPENREFINE_APPDIR}
wget --no-verbose -O .openrefine/client wget --no-verbose -O "${OPENREFINE_APPDIR}/client" https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux chmod +x "${OPENREFINE_APPDIR}/client"
- chmod +x .openrefine/client # make client executable
start: start:
dir: ./{{.DIR}} - | # requirement OpenRefine
cmds: if [ ! -f "${OPENREFINE_APPDIR}/refine" ]; then
- | # verify that OpenRefine is installed
if [ ! -f "$OPENREFINE" ]; then
echo 1>&2 "OpenRefine missing; try task install"; exit 1 echo 1>&2 "OpenRefine missing; try task install"; exit 1
fi fi
- | # delete temporary files and log file of previous run - | # launch OpenRefine with specific data directory and redirect its output to a log file
rm -rf ./*.project* workspace.json "${OPENREFINE_APPDIR}/refine" -v warn -p "$OPENREFINE_PORT" -m "$OPENREFINE_MEMORY" -d "${OPENREFINE_TMPDIR}" >> "${OPENREFINE_TMPDIR}/log.txt" 2>&1 &
rm -rf "{{.PROJECT}}.log"
- > # launch OpenRefine with specific data directory and redirect its output to a log file
"$OPENREFINE" -v warn -p {{.PORT}} -m {{.RAM}}
-d ../{{.DIR}}
>> "{{.PROJECT}}.log" 2>&1 &
- | # wait until OpenRefine API is available - | # wait until OpenRefine API is available
timeout 30s bash -c "until timeout 30s bash -c "until wget -q -O - -o /dev/null http://localhost:${OPENREFINE_PORT} | cat | grep -q -o OpenRefine; do sleep 1; done"
wget -q -O - -o /dev/null http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine
do sleep 1
done"
stop: import:
dir: ./{{.DIR}} - | # import (requires absolute path)
cmds: "${OPENREFINE_APPDIR}/client" \
- | # shut down OpenRefine gracefully --create "$(readlink -m input/duplicates.csv)" \
PID=$(lsof -t -i:{{.PORT}}) --projectName myproject \
kill $PID > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
while ps -p $PID > /dev/null; do sleep 1; done
- > # archive the OpenRefine project
tar cfz
"{{.PROJECT}}.openrefine.tar.gz"
-C $(grep -l "{{.PROJECT}}" *.project/metadata.json | cut -d '/' -f 1)
.
- rm -rf ./*.project* workspace.json # delete temporary files
kill: transform:
dir: ./{{.DIR}} - | # apply undo/redo history
cmds: for f in config/*.json; do
- | # shut down OpenRefine immediately to save time and disk space "${OPENREFINE_APPDIR}/client" myproject --apply "$f" \
PID=$(lsof -t -i:{{.PORT}}) > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
kill -9 $PID done
while ps -p $PID > /dev/null; do sleep 1; done
- rm -rf ./*.project* workspace.json # delete temporary files export:
- mkdir -p output
- | # export to TSV
"${OPENREFINE_APPDIR}/client" myproject \
--output "$(readlink -m output/deduped.tsv)" \
> >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
stats:
  # Print memory (resident set size, converted to MB) and CPU time of the
  # process that lsof reports on the OpenRefine port. Both lines are shown on
  # the console and appended to the log file in the temporary workspace.
  - |
    refine_pid="$(lsof -t -i:${OPENREFINE_PORT})"
    log_file="${OPENREFINE_TMPDIR}/log.txt"
    echo "used $(($(ps --no-headers -o rss -p "$refine_pid") / 1024)) MB RAM" > >(tee -a "$log_file") 2>&1
    echo "used $(ps --no-headers -o cputime -p "$refine_pid") CPU time" > >(tee -a "$log_file") 2>&1
check: check:
desc: check OpenRefine log for any warnings and exit on error - | # check log file for any warnings
dir: ./{{.DIR}} if grep -i 'exception\|error' "${OPENREFINE_TMPDIR}/log.txt"
cmds: then echo 1>&2 "log contains warnings!"; echo; cat "${OPENREFINE_TMPDIR}/log.txt"; exit 1
- | # find log file(s) and check for "exception" or "error"
if grep -i 'exception\|error' $(find . -name '*.log'); then
echo 1>&2 "log contains warnings!"; exit 1
fi fi
stop:
  # Graceful shutdown: send the default kill signal to the process that lsof
  # reports on the OpenRefine port, then poll once per second until it is gone.
  - |
    refine_pid="$(lsof -t -i:${OPENREFINE_PORT})"
    kill $refine_pid
    until ! ps -p $refine_pid > /dev/null; do
      sleep 1
    done
kill:
  # Immediate shutdown to save time: signal 9 goes to the process that lsof
  # reports on the OpenRefine port; there is no wait for it to exit.
  - |
    refine_pid="$(lsof -t -i:${OPENREFINE_PORT})"
    kill -9 $refine_pid
cleanup:
  # Delete the temporary OpenRefine workspace (the log file lives in it too).
  cmds:
    - rm -rf "${OPENREFINE_TMPDIR}"
git:
  cmds:
    # stage everything that is not excluded by .gitignore
    - git add -A
    # `|| exit 0` keeps a failing commit (e.g. nothing to commit) from failing the task
    - git commit -m "latest change $(date -u)" || exit 0
    - git push
  desc: commit and push if something changed

View File

@ -1 +0,0 @@
{"cells":[{"metadata":{},"cell_type":"markdown","source":"## Run all tasks in parallel"},{"metadata":{"trusted":true},"cell_type":"code","source":"task","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Run a specific task"},{"metadata":{"trusted":true},"cell_type":"code","source":"task example-duplicates:main","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Run some tasks in parallel"},{"metadata":{"trusted":true},"cell_type":"code","source":"task --parallel example-duplicates:main example-doaj:main","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Force run a task even when the task is up-to-date"},{"metadata":{"trusted":true},"cell_type":"code","source":"task example-duplicates:main --force","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Dry-run in verbose mode for debugging"},{"metadata":{"trusted":true},"cell_type":"code","source":"task example-duplicates:main --dry --verbose --force","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## List available tasks"},{"metadata":{"trusted":true},"cell_type":"code","source":"task --list","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"name":"bash","display_name":"Bash","language":"bash"},"language_info":{"name":"bash","codemirror_mode":"shell","mimetype":"text/x-sh","file_extension":".sh"}},"nbformat":4,"nbformat_minor":5}

View File

@ -1,70 +0,0 @@
# Project Taskfile: Library Carpentry lesson covering DOAJ.
# Tasks prefixed with ":" (:start, :stop, :check) are defined in the parent Taskfile.
version: '3'

tasks:
  main:
    desc: Library Carpentry Lesson covering DOAJ
    vars:
      # results in the task namespace, which is identical to the directory name
      DIR: '{{splitList ":" .TASK | first}}'
    cmds:
      - task: refine
      # check OpenRefine log for any warnings and exit on error
      - task: :check
        vars:
          DIR: '{{.DIR}}'

  refine:
    dir: ./{{.DIR}}
    # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
    ignore_error: true
    sources:
      - Taskfile.yml
      - input/**
      - config/**
    generates:
      - ./{{.PROJECT}}.openrefine.tar.gz
      - output/**
    vars:
      DIR: '{{splitList ":" .TASK | first}}'
      PROJECT: doaj
      # assign a different port for each project
      PORT: 3334
      # maximum RAM for OpenRefine java heap space
      RAM: 2048M
      # be careful when making changes here, as the path to the log file
      # should match the server log (see main task "start")
      LOG: '>(tee -a "{{.PROJECT}}.log") 2>&1'
    deps:
      # will be executed each run independent of up-to-date check
      - task: download
    cmds:
      # launch OpenRefine
      - task: :start
        vars:
          DIR: '{{.DIR}}'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
          RAM: '{{.RAM}}'
      # import file
      - >
        "$CLIENT" -P {{.PORT}}
        --create "$(readlink -m input/doaj-article-sample.csv)"
        --projectName "{{.PROJECT}}"
        > {{.LOG}}
      # apply transformation rules
      - >
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/doaj-openrefine.json
        > {{.LOG}}
      - mkdir -p output
      # export to file
      - >
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --output "$(readlink -m output/doaj-results.tsv)"
        > {{.LOG}}
      # print allocated system resources
      - |
        PID="$(lsof -t -i:{{.PORT}})"
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
      # shut down OpenRefine and archive the OpenRefine project
      - task: :stop
        vars:
          DIR: '{{.DIR}}'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'

  download:
    dir: ./{{.DIR}}
    vars:
      DIR: '{{splitList ":" .TASK | first}}'
    cmds:
      - mkdir -p input config
      # Download input
      - >
        wget --no-verbose -O input/doaj-article-sample.csv
        https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-article-sample.csv
      # Download config
      - >
        wget --no-verbose -O config/doaj-openrefine.json
        https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-openrefine.json

  # enable standalone execution (running `task` in project directory)
  default:
    cmds:
      - 'DIR="${PWD##*/}:main" && cd .. && task "$DIR"'

View File

@ -1,56 +0,0 @@
# Project Taskfile: removing duplicates in a very small test dataset.
# Tasks prefixed with ":" (:start, :stop, :check) are defined in the parent Taskfile.
version: '3'

tasks:
  main:
    desc: Removing duplicates in a very small test dataset
    vars:
      # results in the task namespace, which is identical to the directory name
      DIR: '{{splitList ":" .TASK | first}}'
    cmds:
      - task: refine
      # check OpenRefine log for any warnings and exit on error
      - task: :check
        vars:
          DIR: '{{.DIR}}'

  refine:
    dir: ./{{.DIR}}
    # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
    ignore_error: true
    sources:
      - Taskfile.yml
      - input/**
      - config/**
    generates:
      - ./{{.PROJECT}}.openrefine.tar.gz
      - output/**
    vars:
      DIR: '{{splitList ":" .TASK | first}}'
      PROJECT: duplicates
      # assign a different port for each project
      PORT: 3335
      # maximum RAM for OpenRefine java heap space
      RAM: 2048M
      # be careful when making changes here, as the path to the log file
      # should match the server log (see main task "start")
      LOG: '>(tee -a "{{.PROJECT}}.log") 2>&1'
    cmds:
      # launch OpenRefine
      - task: :start
        vars:
          DIR: '{{.DIR}}'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
          RAM: '{{.RAM}}'
      # import file
      - >
        "$CLIENT" -P {{.PORT}}
        --create "$(readlink -m input/duplicates.csv)"
        --encoding UTF-8
        --projectName "{{.PROJECT}}"
        > {{.LOG}}
      # apply transformation rules
      - >
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/duplicates-deletion.json
        > {{.LOG}}
      - mkdir -p output
      # export to file
      - >
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --output "$(readlink -m output/deduped.xls)"
        > {{.LOG}}
      # print allocated system resources
      - |
        PID="$(lsof -t -i:{{.PORT}})"
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
      # shut down OpenRefine and archive the OpenRefine project
      - task: :stop
        vars:
          DIR: '{{.DIR}}'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'

  # enable standalone execution (running `task` in project directory)
  default:
    cmds:
      - 'DIR="${PWD##*/}:main" && cd .. && task "$DIR"'

View File

@ -1,72 +0,0 @@
# Project Taskfile: Powerhouse Museum tutorial.
# Tasks prefixed with ":" (:start, :stop, :check) are defined in the parent Taskfile.
version: '3'

tasks:
  main:
    desc: Powerhouse Museum Tutorial
    vars:
      # results in the task namespace, which is identical to the directory name
      DIR: '{{splitList ":" .TASK | first}}'
    cmds:
      - task: refine
      # check OpenRefine log for any warnings and exit on error
      - task: :check
        vars:
          DIR: '{{.DIR}}'

  refine:
    dir: ./{{.DIR}}
    # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
    ignore_error: true
    sources:
      - Taskfile.yml
      - input/**
      - config/**
    generates:
      - ./{{.PROJECT}}.openrefine.tar.gz
      - output/**
    vars:
      DIR: '{{splitList ":" .TASK | first}}'
      PROJECT: phm
      # assign a different port for each project
      PORT: 3336
      # maximum RAM for OpenRefine java heap space
      RAM: 2048M
      # be careful when making changes here, as the path to the log file
      # should match the server log (see main task "start")
      LOG: '>(tee -a "{{.PROJECT}}.log") 2>&1'
    deps:
      # will be executed each run independent of up-to-date check
      - task: download
    cmds:
      # launch OpenRefine
      - task: :start
        vars:
          DIR: '{{.DIR}}'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
          RAM: '{{.RAM}}'
      # import file
      - >
        "$CLIENT" -P {{.PORT}}
        --create "$(readlink -m input/phm-collection.tsv)"
        --processQuotes false
        --guessCellValueTypes true
        --projectName "{{.PROJECT}}"
        > {{.LOG}}
      # apply transformation rules
      - >
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/phm-transform.json
        > {{.LOG}}
      - mkdir -p output
      # export to file
      - >
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --output "$(readlink -m output/phm-results.tsv)"
        > {{.LOG}}
      # print allocated system resources
      - |
        PID="$(lsof -t -i:{{.PORT}})"
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
      # shut down OpenRefine and archive the OpenRefine project
      - task: :stop
        vars:
          DIR: '{{.DIR}}'
          PORT: '{{.PORT}}'
          PROJECT: '{{.PROJECT}}'

  download:
    dir: ./{{.DIR}}
    vars:
      DIR: '{{splitList ":" .TASK | first}}'
    cmds:
      - mkdir -p input config
      # Download input
      - >
        wget --no-verbose -O input/phm-collection.tsv
        https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/input/phm-collection.tsv
      # Download config
      - >
        wget --no-verbose -O config/phm-transform.json
        https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/config/phm-transform.json

  # enable standalone execution (running `task` in project directory)
  default:
    cmds:
      - 'DIR="${PWD##*/}:main" && cd .. && task "$DIR"'