From 8ee91ee84fc900f9965736616083a758a25b231e Mon Sep 17 00:00:00 2001 From: Felix Lohmeier Date: Wed, 6 Apr 2022 13:30:59 +0200 Subject: [PATCH] reduce complexity --- .github/workflows/all-tasks.yml | 86 --------- .github/workflows/example.yml | 44 +++++ .gitignore | 7 - README.md | 50 ++---- Taskfile.yml | 165 +++++++++--------- .../duplicates-deletion.json | 0 demo.ipynb | 1 - example-doaj/Taskfile.yml | 70 -------- example-duplicates/Taskfile.yml | 56 ------ example-powerhouse/Taskfile.yml | 72 -------- .../input => input}/duplicates.csv | 0 11 files changed, 147 insertions(+), 404 deletions(-) delete mode 100644 .github/workflows/all-tasks.yml create mode 100644 .github/workflows/example.yml rename {example-duplicates/config => config}/duplicates-deletion.json (100%) delete mode 100644 demo.ipynb delete mode 100644 example-doaj/Taskfile.yml delete mode 100644 example-duplicates/Taskfile.yml delete mode 100644 example-powerhouse/Taskfile.yml rename {example-duplicates/input => input}/duplicates.csv (100%) diff --git a/.github/workflows/all-tasks.yml b/.github/workflows/all-tasks.yml deleted file mode 100644 index b1cda7a..0000000 --- a/.github/workflows/all-tasks.yml +++ /dev/null @@ -1,86 +0,0 @@ -name: run all tasks - -on: - workflow_dispatch: # allows you to run this workflow manually from the Actions tab - -jobs: - example-doaj: # available as environment variable $GITHUB_JOB and context ${{ github.job }} - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - name: install go-task 3.10.0 - run: | - wget --no-verbose -O task.tar.gz https://github.com/go-task/task/releases/download/v3.10.0/task_linux_amd64.tar.gz - sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz - - name: install OpenRefine and openrefine-client - run: task install - - name: run task ${{ github.job }} - run: task "$GITHUB_JOB:main" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: ${{ github.job }} - path: ${{ github.job }} - retention-days: 5 - - name: commit and push if output changed # has nothing to do currently because of .gitignore - run: |- - git config user.name "Automated" - git config user.email "actions@users.noreply.github.com" - git add -A - git status - git commit -m "latest change: $(date -u)" || exit 0 - git push - - example-duplicates: # available as environment variable $GITHUB_JOB and context ${{ github.job }} - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - name: install go-task 3.10.0 - run: | - wget --no-verbose -O task.tar.gz https://github.com/go-task/task/releases/download/v3.10.0/task_linux_amd64.tar.gz - sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz - - name: install OpenRefine and openrefine-client - run: task install - - name: run task ${{ github.job }} - run: task "$GITHUB_JOB:main" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: ${{ github.job }} - path: ${{ github.job }} - retention-days: 5 - - name: commit and push if output changed # has nothing to do currently because of .gitignore - run: |- - git config user.name "Automated" - git config user.email "actions@users.noreply.github.com" - git add -A - git status - git commit -m "latest change: $(date -u)" || exit 0 - git push - - example-powerhouse: # available as environment variable $GITHUB_JOB and context ${{ github.job }} - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - name: install go-task 3.10.0 - run: | - wget --no-verbose -O task.tar.gz https://github.com/go-task/task/releases/download/v3.10.0/task_linux_amd64.tar.gz - sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz - - name: install OpenRefine and openrefine-client - run: task install - - name: run task ${{ github.job }} - run: task "$GITHUB_JOB:main" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: ${{ github.job }} - path: ${{ github.job }} - retention-days: 5 - - name: commit and push if output changed # has nothing to do currently because of .gitignore - run: |- - git config user.name "Automated" - git config user.email "actions@users.noreply.github.com" - git add -A - git status - git commit -m "latest change: $(date -u)" || exit 0 - git push diff --git a/.github/workflows/example.yml b/.github/workflows/example.yml new file mode 100644 index 0000000..6fca952 --- /dev/null +++ b/.github/workflows/example.yml @@ -0,0 +1,44 @@ +name: example + +on: + workflow_dispatch: # allows you to run this workflow manually from the Actions tab + +jobs: + main: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - name: install go-task 3.10.0 + run: | + wget --no-verbose -O task.tar.gz https://github.com/go-task/task/releases/download/v3.10.0/task_linux_amd64.tar.gz + sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz + - name: install OpenRefine and openrefine-client + run: task install + - name: start OpenRefine + run: task start + - name: import + run: task import + - name: transform + run: task transform + - name: export + run: task export + - name: print stats + if: always() + run: task stats + - name: check log file + if: always() + run: task check + - name: stop OpenRefine + if: always() + run: task stop + - uses: actions/upload-artifact@v2 + if: always() + with: + name: OpenRefine project and logfile + path: .openrefine/data + retention-days: 7 + - name: git commit and push + run: | + git config user.name "Automated" + git config user.email "actions@users.noreply.github.com" + task git diff --git a/.gitignore b/.gitignore index 0e85580..7ccc4c0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,2 @@ .task .openrefine -*/output -*/*.log -*/*.openrefine.tar.gz -example-doaj/input -example-doaj/config -example-powerhouse/input -example-powerhouse/config diff --git a/README.md b/README.md index 3b0ca2d..8e986a5 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,20 @@ # OpenRefine Task Runner (💎+🤖) -[![Codacy Badge](https://app.codacy.com/project/badge/Grade/888dbf663fdd409e8d8fcf8472114194)](https://www.codacy.com/gh/opencultureconsulting/openrefine-task-runner/dashboard) [![Binder](https://notebooks.gesis.org/binder/badge_logo.svg)](https://notebooks.gesis.org/binder/v2/gh/opencultureconsulting/openrefine-task-runner/main?urlpath=lab/tree/demo.ipynb) +[![Codacy Badge](https://app.codacy.com/project/badge/Grade/888dbf663fdd409e8d8fcf8472114194)](https://www.codacy.com/gh/opencultureconsulting/openrefine-task-runner/dashboard) [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefineder/master) Templates for OpenRefine batch processing (import, transform, export) using the task runner [go-task](https://github.com/go-task/task) and the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) to control OpenRefine via [its HTTP API](https://docs.openrefine.org/technical-reference/openrefine-api). ## Features -* run tasks in parallel * basic error handling by monitoring the OpenRefine server log -* dedicated OpenRefine instances for each task (your existing OpenRefine data will not be touched) +* dedicated OpenRefine instance with temporary workspace (your existing OpenRefine data will not be touched) * prevent unnecessary work by fingerprinting generated files and their sources * the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) used here supports many core features of OpenRefine: * import CSV, TSV, line-based TXT, fixed-width TXT, JSON or XML (and specify input options) * apply [undo/redo history](https://docs.openrefine.org/manual/running/#reusing-operations) from given JSON file(s) * export to CSV, TSV, HTML, XLS, XLSX, ODS * [templating export](https://github.com/opencultureconsulting/openrefine-client#templating) to additional formats like JSON or XML - * works with OpenRefine 2.7, 2.8, 3.0, 3.1, 3.2, 3.3, 3.4, 3.4.1 and 3.5 + * works with OpenRefine 2.7, 2.8, 3.0, 3.1, 3.2, 3.3, 3.4 and 3.5 * tasks are easy to extend with additional commands (e.g. to download input data or validate results) ## Typical workflow @@ -26,10 +25,10 @@ Templates for OpenRefine batch processing (import, transform, export) using the **Possible automation benefits:** -* When you receive updated data (in the same structure), you just need to drop the file and start the task like this: +* When you receive updated data (in the same structure), you just need to drop the input file and start the task like this: ```sh - task example-doaj + task ``` * The entire data processing (including options during import) becomes reproducible. The task configuration file can also be used for documentation through source code comments. @@ -38,18 +37,17 @@ Templates for OpenRefine batch processing (import, transform, export) using the ## Requirements -* GNU/Linux (tested with Fedora 32) +* GNU/Linux (tested with Fedora 34) * JAVA 8+ (for OpenRefine) ## Demo via binder -[![Binder](https://notebooks.gesis.org/binder/badge_logo.svg)](https://notebooks.gesis.org/binder/v2/gh/opencultureconsulting/openrefine-task-runner/main?urlpath=lab/tree/demo.ipynb) +[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefineder/master) - free to use on-demand server with Jupyterlab and Bash Kernel - OpenRefine, openrefine-client and go-task [preinstalled](binder/postBuild) - no registration needed, will start within a few minutes -- [restricted](https://notebooks.gesis.org/faq/) to 4 GB RAM and server will be deleted after 10 minutes of inactivity -- service is provided by GESIS and is intended for use by social scientists +- [restricted](https://mybinder.readthedocs.io/en/latest/about/about.html#how-much-memory-am-i-given-when-using-binder) to 2 GB RAM and server will be deleted after 10 minutes of inactivity ## Install @@ -60,7 +58,7 @@ Templates for OpenRefine batch processing (import, transform, export) using the cd openrefine-task-runner ``` -2. Install [Task 3.10.0](https://github.com/go-task/task/releases/tag/v3.10.0) +2. Install [Task 3.10.0](https://github.com/go-task/task/releases/tag/v3.10.0)+ a) RPM-based (Fedora, CentOS, SLES, etc.) @@ -84,34 +82,28 @@ Templates for OpenRefine batch processing (import, transform, export) using the ## Usage -* Run all tasks in parallel +* Run default task (start, import, transform, export, stats, check, kill and cleanup) ```sh - task + task default ``` -* Run a specific task +* Override settings with environment variables ```sh - task example-duplicates:main - ``` - -* Run some tasks in parallel - - ```sh - task --parallel example-duplicates:main example-doaj:main + OPENREFINE_MEMORY=2000M OPENREFINE_PORT=3334 task default ``` * Force run a task even when the task is up-to-date ```sh - task example-duplicates:main --force + task default --force ``` * Dry-run in verbose mode for debugging ```sh - task example-duplicates:main --dry --verbose --force + task default --dry --verbose --force ``` * List available tasks @@ -120,17 +112,9 @@ Templates for OpenRefine batch processing (import, transform, export) using the task --list ``` -### How to develop your own tasks +### Examples -(first draft, will be elaborated later) - -1. create a new folder -2. copy an example Taskfile.yml -3. provide input data in subdirectory input -4. provide OpenRefine transformation history files in subdirectory config -5. add commands to specific Taskfile (check openrefine-client help screen for available options: `openrefine/client --help`) -6. add project to general Taskfile -7. check memory load and increase RAM if needed +* [noah-biejournals](https://github.com/opencultureconsulting/noah-biejournals): Harvesting des Zeitschriftenservers BieJournals der UB Bielefeld und Transformation in METS/MODS für das Portal noah.nrw ### Getting help diff --git a/Taskfile.yml b/Taskfile.yml index 46c452b..e34731c 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -1,102 +1,109 @@ -# https://github.com/opencultureconsulting/openrefine-task-runner - version: '3' -includes: - example-doaj: example-doaj - example-duplicates: example-duplicates - example-powerhouse: example-powerhouse - # add the directory name of your project here - silent: true -output: prefixed env: - OPENREFINE: - sh: readlink -m .openrefine/refine - CLIENT: - sh: readlink -m .openrefine/client + OPENREFINE_MEMORY: 5120M + OPENREFINE_PORT: 3333 + OPENREFINE_APPDIR: + sh: readlink -m .openrefine + OPENREFINE_TMPDIR: + sh: mktemp -d tasks: default: - desc: execute all projects in parallel - deps: - - task: example-doaj:refine - - task: example-duplicates:refine - - task: example-powerhouse:refine - # add the directory name of your project here + desc: run tasks start, import, transform, export, stats, check, kill and cleanup cmds: - - task: check + - defer: { task: cleanup } # will run even when one of the following commands fail + - task: start + - defer: { task: kill } # will run before cleanup + - defer: { task: check } # will run before kill + - defer: { task: stats } # will run before check + - task: import + - task: transform + - task: export + sources: + - Taskfile.yml + - input/** + - config/** + generates: + - output/** install: - desc: (re)install OpenRefine and openrefine-client into subdirectory .openrefine + desc: install OpenRefine and openrefine-client into subdirectory ${OPENREFINE_APPDIR} cmds: - - | # delete existing install and recreate folder - rm -rf .openrefine - mkdir -p .openrefine - - > # download OpenRefine archive - wget --no-verbose -O openrefine.tar.gz - https://github.com/OpenRefine/OpenRefine/releases/download/3.5.2/openrefine-linux-3.5.2.tar.gz - - | # install OpenRefine into subdirectory .openrefine - tar -xzf openrefine.tar.gz -C .openrefine --strip 1 - rm openrefine.tar.gz + - mkdir -p "${OPENREFINE_APPDIR}" + - | # install OpenRefine into subdirectory ${OPENREFINE_APPDIR} + wget --no-verbose -O openrefine.tar.gz https://github.com/OpenRefine/OpenRefine/releases/download/3.5.2/openrefine-linux-3.5.2.tar.gz + tar -xzf openrefine.tar.gz -C "${OPENREFINE_APPDIR}" --strip 1 && rm openrefine.tar.gz - | # optimize OpenRefine for batch processing - sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' ".openrefine/refine" # fix path issue in OpenRefine startup file - sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' ".openrefine/refine.ini" # do not try to open OpenRefine in browser - sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' ".openrefine/refine.ini" # set autosave period from 5 minutes to 25 hours - - > # download openrefine-client into subdirectory .openrefine - wget --no-verbose -O .openrefine/client - https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux - - chmod +x .openrefine/client # make client executable + sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "${OPENREFINE_APPDIR}/refine" # fix path issue in OpenRefine startup file + sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "${OPENREFINE_APPDIR}/refine.ini" # do not try to open OpenRefine in browser + sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "${OPENREFINE_APPDIR}/refine.ini" # set autosave period from 5 minutes to 25 hours + - | # install openrefine-client into subdirectory ${OPENREFINE_APPDIR} + wget --no-verbose -O "${OPENREFINE_APPDIR}/client" https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux + chmod +x "${OPENREFINE_APPDIR}/client" start: - dir: ./{{.DIR}} - cmds: - - | # verify that OpenRefine is installed - if [ ! -f "$OPENREFINE" ]; then + - | # requirement OpenRefine + if [ ! -f "${OPENREFINE_APPDIR}/refine" ]; then echo 1>&2 "OpenRefine missing; try task install"; exit 1 fi - - | # delete temporary files and log file of previous run - rm -rf ./*.project* workspace.json - rm -rf "{{.PROJECT}}.log" - - > # launch OpenRefine with specific data directory and redirect its output to a log file - "$OPENREFINE" -v warn -p {{.PORT}} -m {{.RAM}} - -d ../{{.DIR}} - >> "{{.PROJECT}}.log" 2>&1 & + - | # launch OpenRefine with specific data directory and redirect its output to a log file + "${OPENREFINE_APPDIR}/refine" -v warn -p "$OPENREFINE_PORT" -m "$OPENREFINE_MEMORY" -d "${OPENREFINE_TMPDIR}" >> "${OPENREFINE_TMPDIR}/log.txt" 2>&1 & - | # wait until OpenRefine API is available - timeout 30s bash -c "until - wget -q -O - -o /dev/null http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine - do sleep 1 - done" + timeout 30s bash -c "until wget -q -O - -o /dev/null http://localhost:${OPENREFINE_PORT} | cat | grep -q -o OpenRefine; do sleep 1; done" - stop: - dir: ./{{.DIR}} - cmds: - - | # shut down OpenRefine gracefully - PID=$(lsof -t -i:{{.PORT}}) - kill $PID - while ps -p $PID > /dev/null; do sleep 1; done - - > # archive the OpenRefine project - tar cfz - "{{.PROJECT}}.openrefine.tar.gz" - -C $(grep -l "{{.PROJECT}}" *.project/metadata.json | cut -d '/' -f 1) - . - - rm -rf ./*.project* workspace.json # delete temporary files + import: + - | # import (requires absolute path) + "${OPENREFINE_APPDIR}/client" \ + --create "$(readlink -m input/duplicates.csv)" \ + --projectName myproject \ + > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1 - kill: - dir: ./{{.DIR}} - cmds: - - | # shut down OpenRefine immediately to save time and disk space - PID=$(lsof -t -i:{{.PORT}}) - kill -9 $PID - while ps -p $PID > /dev/null; do sleep 1; done - - rm -rf ./*.project* workspace.json # delete temporary files + transform: + - | # apply undo/redo history + for f in config/*.json; do + "${OPENREFINE_APPDIR}/client" myproject --apply "$f" \ + > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1 + done + + export: + - mkdir -p output + - | # export to TSV + "${OPENREFINE_APPDIR}/client" myproject \ + --output "$(readlink -m output/deduped.tsv)" \ + > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1 + + stats: + - | # print RAM and CPU usage + PID="$(lsof -t -i:${OPENREFINE_PORT})" + echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" \ + > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1 + echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \ + > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1 check: - desc: check OpenRefine log for any warnings and exit on error - dir: ./{{.DIR}} - cmds: - - | # find log file(s) and check for "exception" or "error" - if grep -i 'exception\|error' $(find . -name '*.log'); then - echo 1>&2 "log contains warnings!"; exit 1 + - | # check log file for any warnings + if grep -i 'exception\|error' "${OPENREFINE_TMPDIR}/log.txt" + then echo 1>&2 "log contains warnings!"; echo; cat "${OPENREFINE_TMPDIR}/log.txt"; exit 1 fi + + stop: + - | # shut down OpenRefine gracefully + PID="$(lsof -t -i:${OPENREFINE_PORT})" + kill $PID; while ps -p $PID > /dev/null; do sleep 1; done + + kill: + - | # shut down OpenRefine immediately to save time + PID="$(lsof -t -i:${OPENREFINE_PORT})" + kill -9 $PID + + cleanup: rm -rf "${OPENREFINE_TMPDIR}" + + git: + desc: commit and push if something changed + cmds: + - git add -A + - git commit -m "latest change $(date -u)" || exit 0 + - git push diff --git a/example-duplicates/config/duplicates-deletion.json b/config/duplicates-deletion.json similarity index 100% rename from example-duplicates/config/duplicates-deletion.json rename to config/duplicates-deletion.json diff --git a/demo.ipynb b/demo.ipynb deleted file mode 100644 index 79839ce..0000000 --- a/demo.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"cells":[{"metadata":{},"cell_type":"markdown","source":"## Run all tasks in parallel"},{"metadata":{"trusted":true},"cell_type":"code","source":"task","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Run a specific task"},{"metadata":{"trusted":true},"cell_type":"code","source":"task example-duplicates:main","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Run some tasks in parallel"},{"metadata":{"trusted":true},"cell_type":"code","source":"task --parallel example-duplicates:main example-doaj:main","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Force run a task even when the task is up-to-date"},{"metadata":{"trusted":true},"cell_type":"code","source":"task example-duplicates:main --force","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Dry-run in verbose mode for debugging"},{"metadata":{"trusted":true},"cell_type":"code","source":"task example-duplicates:main --dry --verbose --force","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## List available tasks"},{"metadata":{"trusted":true},"cell_type":"code","source":"task --list","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"name":"bash","display_name":"Bash","language":"bash"},"language_info":{"name":"bash","codemirror_mode":"shell","mimetype":"text/x-sh","file_extension":".sh"}},"nbformat":4,"nbformat_minor":5} \ No newline at end of file diff --git a/example-doaj/Taskfile.yml b/example-doaj/Taskfile.yml deleted file mode 100644 index 8151592..0000000 --- a/example-doaj/Taskfile.yml +++ /dev/null @@ -1,70 +0,0 @@ -version: '3' - -tasks: - main: - desc: Library Carpentry Lesson covering DOAJ - vars: - DIR: '{{splitList ":" .TASK | first}}' # results in the task namespace, which is identical to the directory name - cmds: - - task: refine - - task: :check # check OpenRefine log for any warnings and exit on error - vars: {DIR: '{{.DIR}}'} - - refine: - dir: ./{{.DIR}} - vars: - DIR: '{{splitList ":" .TASK | first}}' - PROJECT: doaj - PORT: 3334 # assign a different port for each project - RAM: 2048M # maximum RAM for OpenRefine java heap space - LOG: '>(tee -a "{{.PROJECT}}.log") 2>&1' # be careful when making changes here, as the path to the log file should match the server log (see main task "start") - deps: - - task: download # will be executed each run independent of up-to-date check - cmds: - - task: :start # launch OpenRefine - vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} - - > # import file - "$CLIENT" -P {{.PORT}} - --create "$(readlink -m input/doaj-article-sample.csv)" - --projectName "{{.PROJECT}}" - > {{.LOG}} - - > # apply transformation rules - "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" - --apply config/doaj-openrefine.json - > {{.LOG}} - - mkdir -p output - - > # export to file - "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" - --output "$(readlink -m output/doaj-results.tsv)" - > {{.LOG}} - - | # print allocated system resources - PID="$(lsof -t -i:{{.PORT}})" - echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}} - echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}} - - task: :stop # shut down OpenRefine and archive the OpenRefine project - vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} - sources: - - Taskfile.yml - - input/** - - config/** - generates: - - ./{{.PROJECT}}.openrefine.tar.gz - - output/** - ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141 - - download: - dir: ./{{.DIR}} - vars: - DIR: '{{splitList ":" .TASK | first}}' - cmds: - - mkdir -p input config - - > # Download input - wget --no-verbose -O input/doaj-article-sample.csv - https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-article-sample.csv - - > # Download config - wget --no-verbose -O config/doaj-openrefine.json - https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-openrefine.json - - default: # enable standalone execution (running `task` in project directory) - cmds: - - DIR="${PWD##*/}:main" && cd .. && task "$DIR" diff --git a/example-duplicates/Taskfile.yml b/example-duplicates/Taskfile.yml deleted file mode 100644 index f541804..0000000 --- a/example-duplicates/Taskfile.yml +++ /dev/null @@ -1,56 +0,0 @@ -version: '3' - -tasks: - main: - desc: Removing duplicates in a very small test dataset - vars: - DIR: '{{splitList ":" .TASK | first}}' # results in the task namespace, which is identical to the directory name - cmds: - - task: refine - - task: :check # check OpenRefine log for any warnings and exit on error - vars: {DIR: '{{.DIR}}'} - - refine: - dir: ./{{.DIR}} - vars: - DIR: '{{splitList ":" .TASK | first}}' - PROJECT: duplicates - PORT: 3335 # assign a different port for each project - RAM: 2048M # maximum RAM for OpenRefine java heap space - LOG: '>(tee -a "{{.PROJECT}}.log") 2>&1' # be careful when making changes here, as the path to the log file should match the server log (see main task "start") - cmds: - - task: :start # launch OpenRefine - vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} - - > # import file - "$CLIENT" -P {{.PORT}} - --create "$(readlink -m input/duplicates.csv)" - --encoding UTF-8 - --projectName "{{.PROJECT}}" - > {{.LOG}} - - > # apply transformation rules - "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" - --apply config/duplicates-deletion.json - > {{.LOG}} - - mkdir -p output - - > # export to file - "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" - --output "$(readlink -m output/deduped.xls)" - > {{.LOG}} - - | # print allocated system resources - PID="$(lsof -t -i:{{.PORT}})" - echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}} - echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}} - - task: :stop # shut down OpenRefine and archive the OpenRefine project - vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} - sources: - - Taskfile.yml - - input/** - - config/** - generates: - - ./{{.PROJECT}}.openrefine.tar.gz - - output/** - ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141 - - default: # enable standalone execution (running `task` in project directory) - cmds: - - DIR="${PWD##*/}:main" && cd .. && task "$DIR" diff --git a/example-powerhouse/Taskfile.yml b/example-powerhouse/Taskfile.yml deleted file mode 100644 index f3afd2f..0000000 --- a/example-powerhouse/Taskfile.yml +++ /dev/null @@ -1,72 +0,0 @@ -version: '3' - -tasks: - main: - desc: Powerhouse Museum Tutorial - vars: - DIR: '{{splitList ":" .TASK | first}}' # results in the task namespace, which is identical to the directory name - cmds: - - task: refine - - task: :check # check OpenRefine log for any warnings and exit on error - vars: {DIR: '{{.DIR}}'} - - refine: - dir: ./{{.DIR}} - vars: - DIR: '{{splitList ":" .TASK | first}}' - PROJECT: phm - PORT: 3336 # assign a different port for each project - RAM: 2048M # maximum RAM for OpenRefine java heap space - LOG: '>(tee -a "{{.PROJECT}}.log") 2>&1' # be careful when making changes here, as the path to the log file should match the server log (see main task "start") - deps: - - task: download # will be executed each run independent of up-to-date check - cmds: - - task: :start # launch OpenRefine - vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} - - > # import file - "$CLIENT" -P {{.PORT}} - --create "$(readlink -m input/phm-collection.tsv)" - --processQuotes false - --guessCellValueTypes true - --projectName "{{.PROJECT}}" - > {{.LOG}} - - > # apply transformation rules - "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" - --apply config/phm-transform.json - > {{.LOG}} - - mkdir -p output - - > # export to file - "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" - --output "$(readlink -m output/phm-results.tsv)" - > {{.LOG}} - - | # print allocated system resources - PID="$(lsof -t -i:{{.PORT}})" - echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}} - echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}} - - task: :stop # shut down OpenRefine and archive the OpenRefine project - vars: {DIR: '{{.DIR}}', PORT: '{{.PORT}}', PROJECT: '{{.PROJECT}}'} - sources: - - Taskfile.yml - - input/** - - config/** - generates: - - ./{{.PROJECT}}.openrefine.tar.gz - - output/** - ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141 - - download: - dir: ./{{.DIR}} - vars: - DIR: '{{splitList ":" .TASK | first}}' - cmds: - - mkdir -p input config - - > # Download input - wget --no-verbose -O input/phm-collection.tsv - https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/input/phm-collection.tsv - - > # Download config - wget --no-verbose -O config/phm-transform.json - https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/config/phm-transform.json - - default: # enable standalone execution (running `task` in project directory) - cmds: - - DIR="${PWD##*/}:main" && cd .. && task "$DIR" diff --git a/example-duplicates/input/duplicates.csv b/input/duplicates.csv similarity index 100% rename from example-duplicates/input/duplicates.csv rename to input/duplicates.csv