#!/bin/bash
# openrefine-batch-docker.sh, Felix Lohmeier, v1.16, 2021-11-09
# https://github.com/felixlohmeier/openrefine-batch
#
# Starts an OpenRefine server in a docker container, imports every file found
# in the input directory as a project, applies every transformation rule file
# found in the config directory to every project, exports each project to the
# output directory (which doubles as the OpenRefine workspace) and finally
# prints the run time of each step and the highest observed memory load.

# images used for the server and for the command line client
readonly server_image="felixlohmeier/openrefine"
readonly client_image="felixlohmeier/openrefine-client:v0.3.10"

# check system requirements
if ! command -v docker > /dev/null 2>&1; then
  echo 1>&2 "This action requires you to have 'docker' installed and present in your PATH. You can download it for free at http://www.docker.com/"
  exit 1
fi
# docker is kept as an array so that "sudo docker" expands to two words
if docker info 2> /dev/null | grep -q 'Server Version'; then
  docker=(docker)
else
  echo "command 'docker info' failed, trying again with sudo..."
  if ! sudo docker info 2> /dev/null | grep -q 'Server Version'; then
    echo 1>&2 "This action requires you to start the docker daemon. Try 'sudo systemctl start docker' or 'sudo start docker'. If the docker daemon is already running then maybe some security privileges are missing to run docker commands."
    exit 1
  fi
  # report success only after the sudo attempt has actually been verified
  echo "OK"
  docker=(sudo docker)
fi

# ---------------------------------------------------------------------------
# NOTE(review): in the reviewed copy of this file everything between the
# start of the help text and the end of the "-a" option handler was lost.
# The help text, the defaults and the "-a" handler below are rebuilt from
# what the remaining code shows (option handlers, variable names, messages).
# Confirm them against the upstream release before merging.
# ---------------------------------------------------------------------------

# defaults
# -E / -R / -X can only switch these to "false" and nothing sets them to
# "true", so "true" has to be the starting value.
export="true"
restartfile="true"
restarttransform="true"
# NOTE(review): the next three values are not visible in the reviewed copy;
# they are believed to match the upstream v1.16 defaults - TODO confirm.
ram="2048M"
version="3.5.0"
exportformat="tsv"

inputfiles=()
jsonfiles=()
crossprojects=()
inputoptions=()
templating=()
memoryload=()
checkpointdate=()
checkpointname=()

# help screen
# Prints a summary of all options and terminates the script with status 1.
usage() {
  cat << EOF
Usage: ./openrefine-batch-docker.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...

== basic arguments ==
    -a INPUTDIR      path to directory with source files
    -b TRANSFORMDIR  path to directory with OpenRefine transformation rules (json files)
    -c OUTPUTDIR     path to directory for exported files (and OpenRefine workspace), must be empty

== options ==
    -d CROSSDIR      path to directory with additional OpenRefine projects for the cross function
    -e EXPORTFORMAT  export format, used as file extension (default: ${exportformat})
    -f INPUTFORMAT   input format
    -i INPUTOPTIONS  input option as key=value, may be given multiple times (e.g. -i recordPath=record)
    -m RAM           heap space for the OpenRefine server (default: ${ram})
    -t TEMPLATING    templating option as key=value, may be given multiple times (sets export format to txt)
    -v VERSION       OpenRefine version, i.e. docker image tag (default: ${version})
    -E               do NOT export files
    -R               do NOT restart OpenRefine after each transformation
    -X               do NOT restart OpenRefine after each project
    -h               display this help screen
EOF
  exit 1
}

# Records a named checkpoint with the current time.
# Checkpoints are stored 1-based in checkpointdate[] / checkpointname[].
# Arguments: $1 - checkpoint name
add_checkpoint() {
  local next=$((${#checkpointdate[@]} + 1))
  checkpointdate[next]=$(date +%s)
  checkpointname[next]=$1
}

# Records a checkpoint and prints its headline and starting time.
# The printed number is the count of checkpoints recorded before this one.
# Arguments: $1 - checkpoint name
announce_checkpoint() {
  local number=${#checkpointdate[@]}
  add_checkpoint "$1"
  echo "=== ${number}. $1 ==="
  echo ""
  echo "starting time: $(date --date=@"${checkpointdate[number + 1]}")"
  echo ""
}

# Starts the OpenRefine server container (named $uuid, workspace $outputdir),
# blocks until its web interface answers and attaches to its log output in
# the background.
start_server() {
  "${docker[@]}" run -d --name="${uuid}" -v "${outputdir}:/data:z" \
    "${server_image}:${version}" -i 0.0.0.0 -m "${ram}" -d /data
  # poll with curl from a linked client container until the start page is
  # served; the intermediate cat is kept from the original pipeline
  until "${docker[@]}" run --rm --link "${uuid}" --entrypoint /usr/bin/curl \
    "${client_image}" --silent -N "http://${uuid}:3333" \
    | cat | grep -q -o "OpenRefine"; do
    sleep 1
  done
  # show server logs
  "${docker[@]}" attach "${uuid}" &
}

# Stops and removes the OpenRefine server container.
stop_server() {
  "${docker[@]}" stop -t=5000 "${uuid}"
  "${docker[@]}" rm "${uuid}"
}

# Restarts the server container (used to clear memory or to make the server
# pick up projects copied into the workspace).
restart_server() {
  stop_server
  start_server
}

# Prints the resource usage of all java processes and appends their resident
# set sizes (as reported by ps) to memoryload[].
sample_memory() {
  ps -o start,etime,%mem,%cpu,rss -C java --sort=start
  # word splitting is intended: one array element per reported process
  # shellcheck disable=SC2207
  memoryload+=($(ps --no-headers -o rss -C java))
}

# safe cleanup handler
# Stops and removes the server container and deletes workspace metadata and
# the copied cross projects from the output directory.
cleanup() {
  local project
  echo "cleanup..."
  stop_server
  rm -r -f "${outputdir:?}"/workspace*.json
  # delete duplicates from copied projects
  for project in "${crossprojects[@]}"; do
    rm -r -f "${outputdir:?}/${project}"
  done
}

# get user input
# The leading ':' selects silent error reporting, which is what makes the
# ':' (missing argument) arm reachable and sets OPTARG for unknown options.
while getopts ':a:b:c:d:e:f:i:m:t:v:ERXh' opt; do
  case "$opt" in
    a)
      # rebuilt by analogy with -b (see NOTE(review) above)
      inputdir=$(readlink -f "${OPTARG}")
      if [ -n "${inputdir// /}" ]; then
        mapfile -t inputfiles < <(find -L "${inputdir}"/* -type f -printf "%f\n" 2> /dev/null)
      fi
      ;;
    b)
      configdir=$(readlink -f "${OPTARG}")
      if [ -n "${configdir// /}" ]; then
        mapfile -t jsonfiles < <(find -L "${configdir}"/* -type f -printf "%f\n" 2> /dev/null)
      fi
      ;;
    c)
      outputdir=$(readlink -m "${OPTARG}")
      mkdir -p "${outputdir}"
      ;;
    d)
      crossdir=$(readlink -f "${OPTARG}")
      if [ -n "${crossdir// /}" ]; then
        mapfile -t crossprojects < <(find -L "${crossdir}"/* -maxdepth 0 -type d -printf "%f\n" 2> /dev/null)
      fi
      ;;
    e)
      # only the export format is set here: "format" drives the checks for
      # mandatory *input* options below and must not be changed by -e
      exportformat="${OPTARG}"
      ;;
    f)
      format="${OPTARG}"
      inputformat="--format=${OPTARG}"
      ;;
    i) inputoptions+=("--${OPTARG}") ;;
    m) ram=${OPTARG} ;;
    t)
      templating+=("--${OPTARG}")
      exportformat="txt"
      ;;
    v) version=${OPTARG} ;;
    E) export="false" ;;
    R) restarttransform="false" ;;
    X) restartfile="false" ;;
    h) usage ;;
    \?)
      echo 1>&2 "Unknown option: -$OPTARG"
      usage
      exit 1
      ;;
    :)
      echo 1>&2 "Missing option argument for -$OPTARG"
      usage
      exit 1
      ;;
    *)
      echo 1>&2 "Unimplemented option: -$OPTARG"
      usage
      exit 1
      ;;
  esac
done
shift $((OPTIND - 1))

# check for mandatory options
if [ -z "$outputdir" ]; then
  echo 1>&2 "please provide path to directory for exported files (and OpenRefine workspace)"
  echo 1>&2 "example: ./openrefine-batch-docker.sh -c output/"
  exit 1
fi
if [ "$(ls -A "$outputdir" 2> /dev/null)" ]; then
  echo 1>&2 "path to directory for exported files (and OpenRefine workspace) is not empty"
  echo 1>&2 "$outputdir"
  exit 1
fi
if [[ ("$format" == "xml" || "$format" == "json") && ${#inputoptions[@]} -eq 0 ]]; then
  echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
  echo 1>&2 "please provide recordpath in multiple arguments without slashes"
  echo 1>&2 "example: ./openrefine-batch-docker.sh ... -f $format -i recordPath=collection -i recordPath=record"
  exit 1
fi
if [[ "$format" == "fixed-width" && ${#inputoptions[@]} -eq 0 ]]; then
  echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
  echo 1>&2 "please provide column widths separated by comma (e.g. 7,5)"
  echo 1>&2 "example: ./openrefine-batch-docker.sh ... -f $format -i columnWidths=7,5"
  exit 1
fi
if [[ ("$format" == "xlsx" || "$format" == "ods") && ${#inputoptions[@]} -eq 0 ]]; then
  echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
  echo 1>&2 "please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)"
  echo 1>&2 "example: ./openrefine-batch-docker.sh ... -f $format -i sheets=0"
  exit 1
fi

# print variables
# the random uuid is used as name of the server container
uuid=$(< /proc/sys/kernel/random/uuid)
echo "Input directory: $inputdir"
echo "Input files: ${inputfiles[*]}"
echo "Input format: $inputformat"
echo "Input options: ${inputoptions[*]}"
echo "Config directory: $configdir"
echo "Transformation rules: ${jsonfiles[*]}"
echo "Cross directory: $crossdir"
echo "Cross projects: ${crossprojects[*]}"
echo "OpenRefine heap space: $ram"
echo "OpenRefine version: $version"
echo "OpenRefine workspace: $outputdir"
echo "Export to workspace: $export"
echo "Export format: $exportformat"
echo "Templating options: ${templating[*]}"
echo "Docker container name: $uuid"
echo "restart after file: $restartfile"
echo "restart after transform: $restarttransform"
echo ""

add_checkpoint "Start process"

# remove the container and temporary workspace files when interrupted
trap "cleanup;exit" SIGHUP SIGINT SIGQUIT SIGTERM

# launch server
announce_checkpoint "Launch OpenRefine"
start_server
echo ""

# import all files
if ((${#inputfiles[@]} > 0)); then
  announce_checkpoint "Import all files"
  for inputfile in "${inputfiles[@]}"; do
    echo "import ${inputfile}..."
    # run client with input command (the format argument is omitted
    # entirely when no -f was given)
    "${docker[@]}" run --rm --link "${uuid}" -v "${inputdir}:/data:z" \
      "${client_image}" -H "${uuid}" -c "${inputfile}" \
      ${inputformat:+"$inputformat"} "${inputoptions[@]}"
    # show allocated system resources
    sample_memory
    echo ""
    # restart server to clear memory
    if [ "$restartfile" = "true" ]; then
      echo "save project and restart OpenRefine server..."
      restart_server
      echo ""
    fi
  done
fi

# transform and export files
if ((${#jsonfiles[@]} > 0)) || [ "$export" = "true" ]; then
  announce_checkpoint "Prepare transform & export"

  # get project ids
  echo "get project ids..."
  projectlist=$("${docker[@]}" run --rm --link "${uuid}" "${client_image}" -H "${uuid}" -l)
  # Each line carries the project id in columns 2-14 and the project name
  # from column 17 on. Lines are parsed one by one so that ids and names
  # stay aligned even if a name contains blanks.
  projectids=()
  projectnames=()
  while IFS= read -r line; do
    id=${line:1:13}
    id=${id//[[:space:]]/}
    [[ -n "$id" ]] || continue
    projectids+=("$id")
    projectnames+=("${line:16}")
  done <<< "$projectlist"
  printf '%s\n' "$projectlist"
  echo ""

  # provide additional OpenRefine projects for cross function
  if ((${#crossprojects[@]} > 0)); then
    echo "provide additional projects for cross function..."
    # copy given projects to workspace
    rsync -a --exclude='*.project/history' "${crossdir}"/*.project "${outputdir}"
    # restart server to advertise copied projects
    echo "restart OpenRefine server to advertise copied projects..."
    restart_server
    echo ""
  fi

  # loop for all projects
  for ((i = 0; i < ${#projectids[@]}; ++i)); do

    # apply transformation rules
    if ((${#jsonfiles[@]} > 0)); then
      announce_checkpoint "Transform ${projectnames[i]}"
      for jsonfile in "${jsonfiles[@]}"; do
        echo "transform ${jsonfile}..."
        # run client with apply command
        "${docker[@]}" run --rm --link "${uuid}" -v "${configdir}:/data:z" \
          "${client_image}" -H "${uuid}" -f "${jsonfile}" "${projectids[i]}"
        # allocated system resources
        sample_memory
        echo ""
        # restart server to clear memory
        if [ "$restarttransform" = "true" ]; then
          echo "save project and restart OpenRefine server..."
          restart_server
        fi
        echo ""
      done
    fi

    # export project to workspace
    if [ "$export" = "true" ]; then
      announce_checkpoint "Export ${projectnames[i]}"
      # get filename without extension
      filename=${projectnames[i]%.*}
      echo "export to file ${filename}.${exportformat}..."
      # run client with export command
      "${docker[@]}" run --rm --link "${uuid}" -v "${outputdir}:/data:z" \
        "${client_image}" -H "${uuid}" -E \
        --output="${filename}.${exportformat}" "${templating[@]}" "${projectids[i]}"
      # show allocated system resources
      sample_memory
      echo ""
    fi

    # restart server to clear memory
    if [ "$restartfile" = "true" ]; then
      echo "restart OpenRefine server..."
      restart_server
    fi
    echo ""
  done

  # list output files
  if [ "$export" = "true" ]; then
    echo "output (number of lines / size in bytes):"
    wc -c -l "${outputdir}"/*."${exportformat}"
    echo ""
  fi
fi

# run cleanup function
cleanup
echo ""

# calculate and print checkpoints
echo "=== Statistics ==="
echo ""
add_checkpoint "End process"
echo "starting time and run time of each step:"
checkpoints=${#checkpointdate[@]}
# extra timestamp after the last checkpoint so that it gets a duration too
checkpointdate[checkpoints + 1]=$(date +%s)
for ((i = 1; i <= checkpoints; i++)); do
  diffsec=$((checkpointdate[i + 1] - checkpointdate[i]))
  printf '%35s %s (%s)\n' "${checkpointname[i]}" \
    "$(date --date=@"${checkpointdate[i]}")" \
    "$(date -d@"${diffsec}" -u +%H:%M:%S)"
done
echo ""
diffsec=$((checkpointdate[checkpoints] - checkpointdate[1]))
echo "total run time: $(date -d@"${diffsec}" -u +%H:%M:%S) (hh:mm:ss)"

# calculate and print memory load
# starts at 0 so that a run without any memory sample reports 0 MB instead
# of failing on an empty operand
max=0
for n in "${memoryload[@]}"; do
  if ((n > max)); then
    max=$n
  fi
done
echo "highest memory load: $((max / 1024)) MB"