#!/bin/bash
# openrefine-batch.sh, Felix Lohmeier, v1.4, 2017-08-02
# https://github.com/felixlohmeier/openrefine-batch
#
# Batch processing with OpenRefine:
#   1. install OpenRefine and the OpenRefine client next to this script
#      (only if the directories "openrefine" / "openrefine-client" are missing)
#   2. launch an OpenRefine server that uses OUTPUTDIR as its workspace
#   3. import every file found in INPUTDIR as a project
#   4. apply every transformation file found in TRANSFORMDIR to every project
#   5. export every project as TSV into OUTPUTDIR
#   6. stop the server and print run time / memory statistics
#
# Requires: bash, java, wget, tar, GNU sed/find/date/readlink, ps, rsync.

# declare download URLs for OpenRefine and OpenRefine client
openrefine_URL="https://github.com/felixlohmeier/OpenRefine/releases/download/2017-08-02/openrefine-linux-2017-08-02.tar.gz"
client_URL="https://github.com/felixlohmeier/openrefine-client/releases/download/v0.3.1/openrefine-client_0-3-1_linux-64bit"

# local path of the client executable (wget -P keeps the remote file name)
client="openrefine-client/$(basename "$client_URL")"

# check system requirements
JAVA="$(command -v java 2> /dev/null)"
if [ -z "$JAVA" ] ; then
    echo 1>&2 "This action requires you to have 'Java JRE' installed. You can download it for free at https://java.com"
    exit 1
fi

# check if wget supports option --show-progress (since wget 1.16)
# (kept in an array so that "no option" expands to no argument at all)
if wget --help | grep -q '\--show-progress' ; then
    wget_opt=(--show-progress)
else
    wget_opt=()
fi

# autoinstall OpenRefine
if [ ! -d "openrefine" ]; then
    echo "Download OpenRefine..."
    archive="$(basename "$openrefine_URL")"
    if ! wget -q "${wget_opt[@]}" "$openrefine_URL" ; then
        echo 1>&2 "error: download of $openrefine_URL failed"
        rm -f "$archive"
        exit 1
    fi
    echo "Install OpenRefine in subdirectory openrefine..."
    # create the directory only after a successful download; a leftover
    # empty directory would make every later run skip the installation
    mkdir -p openrefine
    if ! tar -xzf "$archive" -C openrefine --strip 1 --totals ; then
        echo 1>&2 "error: could not unpack $archive"
        rm -r -f openrefine "$archive"
        exit 1
    fi
    rm -f "$archive"
    # append the headless switch, raise the autosave period and make the
    # launcher use REFINE_MEMORY for the initial heap size as well
    sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' openrefine/refine.ini
    sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' openrefine/refine.ini
    sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' openrefine/refine
    echo ""
fi

# autoinstall OpenRefine client
if [ ! -d "openrefine-client" ]; then
    echo "Download OpenRefine client..."
    mkdir -p openrefine-client
    if ! wget -q -P openrefine-client "${wget_opt[@]}" "$client_URL" ; then
        echo 1>&2 "error: download of $client_URL failed"
        # remove the directory again so that the next run retries
        rm -r -f openrefine-client
        exit 1
    fi
    chmod +x "$client"
    echo ""
fi

# help screen
# NOTE(review): the help text and the option defaults below were not
# readable in the reviewed copy of this file. They have been rebuilt from
# the option handling and messages that are visible further down; compare
# them with the version under version control before release.
# Prints the help text to stdout and terminates the script with status 1.
function usage () {
    cat <<'EOF'
Usage: ./openrefine-batch.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...

== basic arguments ==
    -a INPUTDIR      path to directory with source files
    -b TRANSFORMDIR  path to directory with OpenRefine transformation rules (json files)
    -c OUTPUTDIR     path to directory for exported files (and OpenRefine workspace)

== options ==
    -d CROSSDIR      path to directory with additional OpenRefine projects (copied to the workspace for the cross function)
    -f INPUTFORMAT   input format passed to the OpenRefine client (e.g. xml, json, fixed-width, xlsx, ods)
    -i INPUTOPTIONS  input option passed to the OpenRefine client (may be given several times)
    -m RAM           maximum RAM for OpenRefine java heap space
    -p PORT          port on which OpenRefine should listen
    -E               do NOT export files
    -R               do NOT restart OpenRefine after each transformation
    -X               do NOT restart OpenRefine after each file
    -h               displays this help screen

== examples ==
    ./openrefine-batch.sh -c output/
    ./openrefine-batch.sh ... -f xml -i recordPath=collection -i recordPath=record
    ./openrefine-batch.sh ... -f fixed-width -i columnWidths=7,5
    ./openrefine-batch.sh ... -f xlsx -i sheets=0
EOF
    exit 1
}

# defaults
# NOTE(review): the values of ram and port are assumptions (2048M / 3333),
# confirm them. The three switches default to "true" because the options
# -E, -R and -X can only set them to "false".
ram="2048M"
port="3333"
export="true"
restartfile="true"
restarttransform="true"
format=""
inputformat=""
inputoptions=()
inputfiles=()
jsonfiles=()
crossprojects=()

# get user input
# The leading colon selects the silent mode of getopts; the "\?" and ":"
# arms below rely on it to find the offending option letter in OPTARG.
options=":a:b:c:d:f:i:m:p:ERXh"
while getopts "$options" opt; do
    case "$opt" in
        a ) inputdir=$(readlink -f "${OPTARG}")
            if [ -n "${inputdir// }" ] ; then
                # one array element per file name, even if it contains spaces
                mapfile -t inputfiles < <(find -L "${inputdir}"/* -type f -printf "%f\n" 2>/dev/null)
            fi ;;
        b ) configdir=$(readlink -f "${OPTARG}")
            if [ -n "${configdir// }" ] ; then
                mapfile -t jsonfiles < <(find -L "${configdir}"/* -type f -printf "%f\n" 2>/dev/null)
            fi ;;
        c ) outputdir=$(readlink -m "${OPTARG}")
            mkdir -p "${outputdir}" ;;
        d ) crossdir=$(readlink -f "${OPTARG}")
            if [ -n "${crossdir// }" ] ; then
                mapfile -t crossprojects < <(find -L "${crossdir}"/* -maxdepth 0 -type d -printf "%f\n" 2>/dev/null)
            fi ;;
        f ) format="${OPTARG}" ; inputformat="--format=${OPTARG}" ;;
        i ) inputoptions+=("--${OPTARG}") ;;
        m ) ram=${OPTARG} ;;
        p ) port=${OPTARG} ;;
        E ) export="false" ;;
        R ) restarttransform="false" ;;
        X ) restartfile="false" ;;
        h ) usage ;;
        \? ) echo 1>&2 "Unknown option: -$OPTARG"; usage; exit 1;;
        : ) echo 1>&2 "Missing option argument for -$OPTARG"; usage; exit 1;;
        * ) echo 1>&2 "Unimplemented option: -$OPTARG"; usage; exit 1;;
    esac
done
shift $((OPTIND - 1))

# check for mandatory options
if [ -z "$outputdir" ]; then
    echo 1>&2 "please provide path to directory for exported files (and OpenRefine workspace)"
    echo 1>&2 "example: ./openrefine-batch.sh -c output/"
    exit 1
fi
if [[ ( "$format" == "xml" || "$format" == "json" ) && ${#inputoptions[@]} -eq 0 ]]; then
    echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
    echo 1>&2 "please provide recordpath in multiple arguments without slashes"
    echo 1>&2 "example: ./openrefine-batch.sh ... -f $format -i recordPath=collection -i recordPath=record"
    exit 1
fi
if [[ "$format" == "fixed-width" && ${#inputoptions[@]} -eq 0 ]]; then
    echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
    echo 1>&2 "please provide column widths separated by comma (e.g. 7,5)"
    echo 1>&2 "example: ./openrefine-batch.sh ... -f $format -i columnWidths=7,5"
    exit 1
fi
if [[ ( "$format" == "xlsx" || "$format" == "ods" ) && ${#inputoptions[@]} -eq 0 ]]; then
    echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
    echo 1>&2 "please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)"
    echo 1>&2 "example: ./openrefine-batch.sh ... -f $format -i sheets=0"
    exit 1
fi

# print variables
echo "Input directory:         $inputdir"
echo "Input files:             ${inputfiles[*]}"
echo "Input format:            $inputformat"
echo "Input options:           ${inputoptions[*]}"
echo "Config directory:        $configdir"
echo "Transformation rules:    ${jsonfiles[*]}"
echo "Cross directory:         $crossdir"
echo "Cross projects:          ${crossprojects[*]}"
echo "OpenRefine heap space:   $ram"
echo "OpenRefine port:         $port"
echo "OpenRefine workspace:    $outputdir"
echo "Export TSV to workspace: $export"
echo "restart after file:      $restartfile"
echo "restart after transform: $restarttransform"
echo ""

# declare additional variables
checkpointdate=()   # start time (epoch seconds) of each step, index starts at 1
checkpointname=()   # label of each step, same index as checkpointdate
memoryload=()       # resident set size (KB) of the server, sampled after each action
pid=""              # process id of the running OpenRefine server, empty if none

# Record the current time as the start of a new step named $1.
function checkpoint () {
    local next=$(( ${#checkpointdate[@]} + 1 ))
    checkpointdate[next]=$(date +%s)
    checkpointname[next]="$1"
}

# Record a new step named $1 and print its header. The printed number is the
# count of steps recorded before it, so "Start process" counts as step 0.
function announce_checkpoint () {
    local number=${#checkpointdate[@]}
    checkpoint "$1"
    echo "=== $number. $1 ==="
    echo ""
    echo "starting time: $(date --date=@"${checkpointdate[number + 1]}")"
    echo ""
}

# Print $1 seconds as hh:mm:ss; hours are not wrapped at 24.
function format_duration () {
    local seconds=$1
    printf '%02d:%02d:%02d' $((seconds / 3600)) $((seconds % 3600 / 60)) $((seconds % 60))
}

# Launch the OpenRefine server in the background and block until its start
# page is served. Aborts the script if the server process ends before that,
# instead of polling forever.
function start_server () {
    openrefine/refine -p "${port}" -d "${outputdir}" -m "${ram}" &
    pid=$!
    # wait until server is available
    until wget -q -O - "http://localhost:${port}" | grep -q -o "OpenRefine" ; do
        if ! kill -0 "${pid}" 2>/dev/null ; then
            echo 1>&2 "error: OpenRefine server exited before it became available on port ${port}"
            pid=""
            exit 1
        fi
        sleep 1
    done
}

# Terminate the server started by start_server (if any) and wait for it.
function stop_server () {
    if [ -n "${pid}" ]; then
        kill "${pid}" 2>/dev/null
        wait "${pid}" 2>/dev/null
        pid=""
    fi
}

# Print message $1, then stop and relaunch the server.
function restart_server () {
    echo "$1"
    stop_server
    echo ""
    start_server
}

# Show the resources allocated by the server and remember its memory usage.
function record_memory () {
    ps -o start,etime,%mem,%cpu,rss -p "${pid}" --sort=start
    # deliberately unquoted: drops the padding ps puts around the number
    # shellcheck disable=SC2207
    memoryload+=($(ps --no-headers -o rss -p "${pid}"))
}

# never leave the server running when the script ends early
trap stop_server EXIT

checkpoint "Start process"

# launch server
announce_checkpoint "Launch OpenRefine"
start_server
echo ""

# import all files
if [ ${#inputfiles[@]} -gt 0 ]; then
    announce_checkpoint "Import all files"
    for inputfile in "${inputfiles[@]}" ; do
        echo "import ${inputfile}..."
        # run client with input command
        "$client" -P "${port}" -c "${inputdir}/${inputfile}" ${inputformat:+"$inputformat"} "${inputoptions[@]}"
        # show allocated system resources
        record_memory
        echo ""
        # restart server to clear memory
        if [ "$restartfile" = "true" ]; then
            restart_server "save project and restart OpenRefine server..."
            echo ""
        fi
    done
fi

# transform and export files
if [ ${#jsonfiles[@]} -gt 0 ] || [ "$export" = "true" ]; then
    announce_checkpoint "Prepare transform & export"

    # get project ids
    echo "get project ids..."
    "$client" -P "${port}" -l > "${outputdir}/projects.tmp"
    # The listing is read as fixed-width text: id in columns 2-14, name from
    # column 17 on. Reading line by line keeps ids and names aligned even
    # when a project name contains spaces.
    projectids=()
    projectnames=()
    while IFS= read -r line || [ -n "$line" ]; do
        projectid="${line:1:13}"
        projectid="${projectid//[[:space:]]/}"
        [ -n "$projectid" ] || continue
        projectids+=("$projectid")
        projectnames+=("${line:16}")
    done < "${outputdir}/projects.tmp"
    cat "${outputdir}/projects.tmp" && rm "${outputdir:?}/projects.tmp"
    echo ""

    # provide additional OpenRefine projects for cross function
    if [ ${#crossprojects[@]} -gt 0 ]; then
        echo "provide additional projects for cross function..."
        # copy given projects to workspace
        rsync -a --exclude='*.project/history' "${crossdir}"/*.project "${outputdir}"
        # restart server to advertise copied projects
        restart_server "restart OpenRefine server to advertise copied projects..."
        echo ""
    fi

    # loop for all projects
    for ((i = 0; i < ${#projectids[@]}; ++i)); do

        # apply transformation rules
        if [ ${#jsonfiles[@]} -gt 0 ]; then
            announce_checkpoint "Transform ${projectnames[i]}"
            for jsonfile in "${jsonfiles[@]}" ; do
                echo "transform ${jsonfile}..."
                # run client with apply command
                "$client" -P "${port}" -f "${configdir}/${jsonfile}" "${projectids[i]}"
                # allocated system resources
                record_memory
                echo ""
                # restart server to clear memory
                if [ "$restarttransform" = "true" ]; then
                    restart_server "save project and restart OpenRefine server..."
                fi
                echo ""
            done
        fi

        # export project to workspace
        if [ "$export" = "true" ]; then
            announce_checkpoint "Export ${projectnames[i]}"
            # get filename without extension
            filename=${projectnames[i]%.*}
            echo "export to file ${filename}.tsv..."
            # run client with export command
            "$client" -P "${port}" -E --output="${outputdir}/${filename}.tsv" "${projectids[i]}"
            # show allocated system resources
            record_memory
            echo ""
        fi

        # restart server to clear memory
        if [ "$restartfile" = "true" ]; then
            restart_server "restart OpenRefine server..."
        fi
        echo ""
    done

    # list output files
    if [ "$export" = "true" ]; then
        echo "output (number of lines / size in bytes):"
        wc -c -l "${outputdir}"/*.tsv
        echo ""
    fi
fi

# cleanup
echo "cleanup..."
stop_server
rm -r -f "${outputdir:?}"/workspace*.json
# delete duplicates from copied projects
if [ ${#crossprojects[@]} -gt 0 ]; then
    for crossproject in "${crossprojects[@]}" ; do
        rm -r -f "${outputdir:?}/${crossproject:?}"
    done
fi
echo ""

# calculate and print checkpoints
echo "=== Statistics ==="
echo ""
checkpoint "End process"
echo "starting time and run time of each step:"
checkpoints=${#checkpointdate[@]}
# extra time stamp without a name: gives the last step an end time
checkpointdate[checkpoints + 1]=$(date +%s)
for ((i = 1; i <= checkpoints; i++)); do
    diffsec=$((checkpointdate[i + 1] - checkpointdate[i]))
    printf "%35s %s (%s)\n" "${checkpointname[i]}" "$(date --date=@"${checkpointdate[i]}")" "$(format_duration "${diffsec}")"
done
echo ""
diffsec=$((checkpointdate[checkpoints] - checkpointdate[1]))
echo "total run time: $(format_duration "${diffsec}") (hh:mm:ss)"

# calculate and print memory load
# start at 0 so that a run without any memory sample still prints a number
max=0
for n in "${memoryload[@]}" ; do
    ((n > max)) && max=$n
done
echo "highest memory load: $((max / 1024)) MB"