#!/bin/bash
# openrefine-batch-docker.sh, Felix Lohmeier, v1.15, 2021-01-04
# https://github.com/felixlohmeier/openrefine-batch
# check system requirements
#
# Verifies that the docker client is on PATH and that 'docker info' reports
# a server version. Afterwards the array 'docker' holds the command prefix
# for every docker call in this script: (docker), or (sudo docker) when the
# daemon only answers with elevated privileges. Exits with status 1 if
# docker is missing or the daemon cannot be reached either way.
DOCKER="$(command -v docker 2>/dev/null)"
if [ -z "$DOCKER" ]; then
  echo 1>&2 "This action requires you to have 'docker' installed and present in your PATH. You can download it for free at http://www.docker.com/"
  exit 1
fi

DOCKERINFO="$(docker info 2>/dev/null | grep 'Server Version')"
if [ -z "$DOCKERINFO" ]; then
  echo "command 'docker info' failed, trying again with sudo..."
  DOCKERINFO="$(sudo docker info 2>/dev/null | grep 'Server Version')"
  if [ -z "$DOCKERINFO" ]; then
    echo 1>&2 "This action requires you to start the docker daemon. Try 'sudo systemctl start docker' or 'sudo start docker'. If the docker daemon is already running then maybe some security privileges are missing to run docker commands.'"
    exit 1
  fi
  # report success only once the sudo attempt has really worked
  echo "OK"
  docker=(sudo docker)
else
  docker=(docker)
fi
# help screen
#
# Prints the complete help text to stdout and terminates the script with
# exit status 1. The here-document delimiter is quoted so that nothing in
# the text is expanded and the trailing backslashes of example 1 are shown
# literally instead of being consumed as line continuations.
usage() {
  cat <<'EOF'
Usage: ./openrefine-batch-docker.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...

== basic arguments ==
    -a INPUTDIR      path to directory with source files (leave empty to transform only ; multiple files may be imported into a single project by providing a zip or tar.gz archive, cf. https://github.com/OpenRefine/OpenRefine/wiki/Importers )
    -b TRANSFORMDIR  path to directory with OpenRefine transformation rules (json files, cf. http://kb.refinepro.com/2012/06/google-refine-json-and-my-notepad-or.html ; leave empty to transform only)
    -c OUTPUTDIR     path to directory for exported files (and OpenRefine workspace)

== options ==
    -d CROSSDIR      path to directory with additional OpenRefine projects (will be copied to workspace before transformation step to support the cross function, cf. https://github.com/OpenRefine/OpenRefine/wiki/GREL-Other-Functions )
    -e EXPORTFORMAT  (csv, tsv, html, xls, xlsx, ods)
    -f INPUTFORMAT   (csv, tsv, xml, json, line-based, fixed-width, xlsx, ods)
    -i INPUTOPTIONS  several options provided by openrefine-client, see below...
    -m RAM           maximum RAM for OpenRefine java heap space (default: 2048M)
    -t TEMPLATING    several options for templating export, see below...
    -v VERSION       OpenRefine version (3.4.1, 3.4, 3.3, 3.2, 3.1, 3.0, 2.8, 2.7, ...; default: 3.4.1)
    -E               do NOT export files
    -R               do NOT restart OpenRefine after each transformation (e.g. config file)
    -X               do NOT restart OpenRefine after each project (e.g. input file)
    -h               displays this help screen

== inputoptions (mandatory for xml, json, fixed-width, xlsx, ods) ==
    -i recordPath=RECORDPATH (xml, json): please provide path in multiple arguments without slashes, e.g. /collection/record/ should be entered like this: -i recordPath=collection -i recordPath=record, default xml: record, default json: _ _
    -i columnWidths=COLUMNWIDTHS (fixed-width): please provide widths separated by comma (e.g. 7,5)
    -i sheets=SHEETS (xls, xlsx, ods): please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)

== more inputoptions (optional, only together with inputformat) ==
    -i projectName=PROJECTNAME (all formats), default: filename
    -i limit=LIMIT (all formats), default: -1
    -i includeFileSources=true/false (all formats), default: false
    -i trimStrings=true/false (xml, json), default: false
    -i storeEmptyStrings=true/false (xml, json), default: true
    -i guessCellValueTypes=true/false (xml, csv, tsv, fixed-width, json), default: false
    -i encoding=ENCODING (csv, tsv, line-based, fixed-width), please provide short encoding name (e.g. UTF-8)
    -i ignoreLines=IGNORELINES (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: -1
    -i headerLines=HEADERLINES (csv, tsv, fixed-width, xls, xlsx, ods), default: 1, default fixed-width: 0
    -i skipDataLines=true/false (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: 0, default line-based: -1
    -i storeBlankRows=true/false (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: true
    -i processQuotes=true/false (csv, tsv), default: true
    -i storeBlankCellsAsNulls=true/false (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: true
    -i linesPerRow=LINESPERROW (line-based), default: 1

== templating options (alternative exportformat) ==
    -t template=TEMPLATE (mandatory; (big) text string that you enter in the *row template* textfield in the export/templating menu in the browser app)
    -t mode=row-based/record-based (engine mode, default: row-based)
    -t prefix=PREFIX (text string that you enter in the *prefix* textfield in the browser app)
    -t rowSeparator=ROWSEPARATOR (text string that you enter in the *row separator* textfield in the browser app)
    -t suffix=SUFFIX (text string that you enter in the *suffix* textfield in the browser app)
    -t filterQuery=REGEX (Simple RegEx text filter on filterColumn, e.g. ^12015$)
    -t filterColumn=COLUMNNAME (column name for filterQuery, default: name of first column)
    -t facets=FACETS (facets config in json format, may be extracted with browser dev tools in browser app)
    -t splitToFiles=true/false (will split each row/record into a single file; it specifies a presumably unique character series for splitting; prefix and suffix will be applied to all files)
    -t suffixById=true/false (enhancement option for splitToFiles; will generate filename-suffix from values in key column)

== examples ==

download example data

wget https://github.com/opencultureconsulting/openrefine-batch/archive/master.zip
unzip master.zip openrefine-batch-master/examples/*
mv openrefine-batch-master/examples .
rm -f master.zip

example 1 (input, transform, export to tsv)

./openrefine-batch-docker.sh \
-a examples/powerhouse-museum/input/ \
-b examples/powerhouse-museum/config/ \
-c examples/powerhouse-museum/output/ \
-f tsv \
-i processQuotes=false \
-i guessCellValueTypes=true \
-RX

example 2 (input, transform, templating export)

./openrefine-batch-docker.sh -a examples/powerhouse-museum/input/ -b examples/powerhouse-museum/config/ -c examples/powerhouse-museum/output/ -f tsv -i processQuotes=false -i guessCellValueTypes=true -RX -t template='{ "Record ID" : {{jsonize(cells["Record ID"].value)}}, "Object Title" : {{jsonize(cells["Object Title"].value)}}, "Registration Number" : {{jsonize(cells["Registration Number"].value)}}, "Description." : {{jsonize(cells["Description."].value)}}, "Marks" : {{jsonize(cells["Marks"].value)}}, "Production Date" : {{jsonize(cells["Production Date"].value)}}, "Provenance (Production)" : {{jsonize(cells["Provenance (Production)"].value)}}, "Provenance (History)" : {{jsonize(cells["Provenance (History)"].value)}}, "Categories" : {{jsonize(cells["Categories"].value)}}, "Persistent Link" : {{jsonize(cells["Persistent Link"].value)}}, "Height" : {{jsonize(cells["Height"].value)}}, "Width" : {{jsonize(cells["Width"].value)}}, "Depth" : {{jsonize(cells["Depth"].value)}}, "Diameter" : {{jsonize(cells["Diameter"].value)}}, "Weight" : {{jsonize(cells["Weight"].value)}}, "License info" : {{jsonize(cells["License info"].value)}} }' -t rowSeparator=',' -t prefix='{ "rows" : [ ' -t suffix='] }' -t splitToFiles=true

EOF
  exit 1
}
# defaults
# These values apply unless overridden by the matching command line option.
ram="2048M"               # -m: java heap space for OpenRefine
version="3.4.1"           # -v: tag of the OpenRefine docker image
restartfile="true"        # -X sets this to false
restarttransform="true"   # -R sets this to false
export="true"             # -E sets this to false
exportformat="tsv"        # -e / -t
# /dev/null stands for "directory not given" for the three optional paths
inputdir=/dev/null
configdir=/dev/null
crossdir=/dev/null

# check input
# Without any argument there is nothing to do: show the help screen, which
# also ends the script.
NUMARGS=$#
if [ "$NUMARGS" -eq 0 ]; then
  usage
fi
# get user input
# The leading ':' selects getopts' silent error reporting, so the ':' and
# '\?' branches below are reached with the offending letter in $OPTARG.
options=":a:b:c:d:e:f:i:m:t:v:ERXh"

# Parses the given command line arguments into the script's global
# settings (inputdir, configdir, outputdir, crossdir, exportformat,
# format, inputformat, ram, version, export, restartfile,
# restarttransform) and the arrays inputfiles, jsonfiles, crossprojects,
# inputoptions and templating.
# Arguments: the script's command line ("$@")
# Side effects: -c creates the output directory; -h and every parsing
#               error end the script through usage (exit status 1).
# Leaves OPTIND pointing behind the last parsed option.
parse_args() {
  local opt
  OPTIND=1
  while getopts "$options" opt; do
    case "$opt" in
      a)
        inputdir=$(readlink -f -- "${OPTARG}")
        if [ -n "${inputdir// }" ]; then
          # one array element per file name, so names with spaces survive
          mapfile -t inputfiles < <(find -L "${inputdir}"/* -type f -printf "%f\n" 2>/dev/null)
        fi
        ;;
      b)
        configdir=$(readlink -f -- "${OPTARG}")
        if [ -n "${configdir// }" ]; then
          mapfile -t jsonfiles < <(find -L "${configdir}"/* -type f -printf "%f\n" 2>/dev/null)
        fi
        ;;
      c)
        outputdir=$(readlink -m -- "${OPTARG}")
        mkdir -p "${outputdir}"
        ;;
      d)
        crossdir=$(readlink -f -- "${OPTARG}")
        if [ -n "${crossdir// }" ]; then
          mapfile -t crossprojects < <(find -L "${crossdir}"/* -maxdepth 0 -type d -printf "%f\n" 2>/dev/null)
        fi
        ;;
      # only the export format is set here; 'format' is reserved for the
      # input format that the mandatory input option checks look at
      e) exportformat="${OPTARG}" ;;
      f)
        format="${OPTARG}"
        inputformat="--format=${OPTARG}"
        ;;
      i) inputoptions+=("--${OPTARG}") ;;
      m) ram=${OPTARG} ;;
      t)
        templating+=("--${OPTARG}")
        exportformat="txt"
        ;;
      v) version=${OPTARG} ;;
      E) export="false" ;;
      R) restarttransform="false" ;;
      X) restartfile="false" ;;
      h) usage ;;
      \?)
        echo 1>&2 "Unknown option: -$OPTARG"
        usage
        exit 1
        ;;
      :)
        echo 1>&2 "Missing option argument for -$OPTARG"
        usage
        exit 1
        ;;
      *)
        echo 1>&2 "Unimplemented option: -$OPTARG"
        usage
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
shift $((OPTIND - 1))
# check for mandatory options
# Each failed check prints an explanation to stderr and ends the script
# with exit status 1.

# the output directory doubles as OpenRefine workspace and is required
if [ -z "$outputdir" ]; then
  echo 1>&2 "please provide path to directory for exported files (and OpenRefine workspace)"
  echo 1>&2 "example: ./openrefine-batch-docker.sh -c output/"
  exit 1
fi

# refuse to run on a directory that already contains anything
if [ -n "$(ls -A "$outputdir" 2>/dev/null)" ]; then
  echo 1>&2 "path to directory for exported files (and OpenRefine workspace) is not empty"
  echo 1>&2 "$outputdir"
  exit 1
fi

# xml and json need at least one input option (recordPath)
if [[ ("$format" == "xml" || "$format" == "json") && ${#inputoptions[@]} -eq 0 ]]; then
  echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
  echo 1>&2 "please provide recordpath in multiple arguments without slashes"
  echo 1>&2 "example: ./openrefine-batch-docker.sh ... -f $format -i recordPath=collection -i recordPath=record"
  exit 1
fi

# fixed-width needs at least one input option (columnWidths)
if [[ "$format" == "fixed-width" && ${#inputoptions[@]} -eq 0 ]]; then
  echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
  echo 1>&2 "please provide column widths separated by comma (e.g. 7,5)"
  echo 1>&2 "example: ./openrefine-batch-docker.sh ... -f $format -i columnWidths=7,5"
  exit 1
fi

# xlsx and ods need at least one input option (sheets)
if [[ ("$format" == "xlsx" || "$format" == "ods") && ${#inputoptions[@]} -eq 0 ]]; then
  echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
  echo 1>&2 "please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)"
  echo 1>&2 "example: ./openrefine-batch-docker.sh ... -f $format -i sheets=0"
  exit 1
fi
# print variables
# A kernel-generated random UUID is used as the docker container name.
uuid=$(</proc/sys/kernel/random/uuid)
echo "Input directory:         $inputdir"
echo "Input files:             ${inputfiles[*]}"
echo "Input format:            $inputformat"
echo "Input options:           ${inputoptions[*]}"
echo "Config directory:        $configdir"
echo "Transformation rules:    ${jsonfiles[*]}"
echo "Cross directory:         $crossdir"
echo "Cross projects:          ${crossprojects[*]}"
echo "OpenRefine heap space:   $ram"
echo "OpenRefine version:      $version"
echo "OpenRefine workspace:    $outputdir"
echo "Export to workspace:     $export"
echo "Export format:           $exportformat"
echo "Templating options:      ${templating[*]}"
echo "Docker container name:   $uuid"
echo "restart after file:      $restartfile"
echo "restart after transform: $restarttransform"
echo ""

# declare additional variables
# checkpointdate/checkpointname are parallel arrays filled from index 1
# upwards: the epoch start time and the title of every processing step.
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="Start process"
# rss values of the java process(es), sampled after each client call
memoryload=()
# safe cleanup handler
#
# Stops and removes the docker container named ${uuid}, deletes the
# workspace*.json files from the output directory and removes the copies
# of the cross projects from it.
# Globals read: docker (command array), uuid, outputdir, crossprojects
cleanup() {
  local project
  echo "cleanup..."
  "${docker[@]}" stop -t=5000 "${uuid}"
  "${docker[@]}" rm "${uuid}"
  # ':?' aborts instead of expanding to '/workspace*.json' when outputdir is empty
  rm -r -f "${outputdir:?}"/workspace*.json
  # delete duplicates from copied projects
  for project in "${crossprojects[@]}"; do
    # an empty name would make the path below the output directory itself
    [ -n "${project}" ] || continue
    rm -r -f "${outputdir:?}/${project}"
  done
}
# also clean up when the script is interrupted or terminated
trap "cleanup;exit" SIGHUP SIGINT SIGQUIT SIGTERM
# launch server
# Records the "Launch OpenRefine" checkpoint, starts the OpenRefine
# container with the output directory mounted at /data and blocks until
# the server answers.
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="Launch OpenRefine"
echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
echo ""
echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
echo ""
"${docker[@]}" run -d --name="${uuid}" -v "${outputdir}:/data:z" "felixlohmeier/openrefine:${version}" -i 0.0.0.0 -m "${ram}" -d /data
# wait until server is available
# (polls once per second with curl from the client image until the page
# served on port 3333 contains "OpenRefine")
until "${docker[@]}" run --rm --link "${uuid}" --entrypoint /usr/bin/curl felixlohmeier/openrefine-client:v0.3.10 --silent -N "http://${uuid}:3333" | cat | grep -q -o "OpenRefine"; do
  sleep 1
done
# show server logs
"${docker[@]}" attach "${uuid}" &
echo ""
# import all files
# Runs the openrefine-client import for every entry of inputfiles (the
# input directory is mounted at /data in the client container). After each
# import the memory use of the java process(es) is sampled and, unless
# restartfile is "false", the server is restarted to clear its memory.
# Does nothing when there are no input files.
if [ "${#inputfiles[@]}" -gt 0 ]; then
  checkpoints=${#checkpointdate[@]}
  checkpointdate[$((checkpoints + 1))]=$(date +%s)
  checkpointname[$((checkpoints + 1))]="Import all files"
  echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
  echo ""
  echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
  echo ""
  for inputfile in "${inputfiles[@]}"; do
    echo "import ${inputfile}..."
    # run client with input command
    # (the --format argument is passed only if an input format was given)
    "${docker[@]}" run --rm --link "${uuid}" -v "${inputdir}:/data:z" felixlohmeier/openrefine-client:v0.3.10 -H "${uuid}" -c "${inputfile}" ${inputformat:+"${inputformat}"} "${inputoptions[@]}"
    # show allocated system resources
    ps -o start,etime,%mem,%cpu,rss -C java --sort=start
    # word splitting is intended: one rss value per java process
    # shellcheck disable=SC2207
    memoryload+=($(ps --no-headers -o rss -C java))
    echo ""
    # restart server to clear memory
    if [ "$restartfile" = "true" ]; then
      echo "save project and restart OpenRefine server..."
      "${docker[@]}" stop -t=5000 "${uuid}"
      "${docker[@]}" rm "${uuid}"
      "${docker[@]}" run -d --name="${uuid}" -v "${outputdir}:/data:z" "felixlohmeier/openrefine:${version}" -i 0.0.0.0 -m "${ram}" -d /data
      until "${docker[@]}" run --rm --link "${uuid}" --entrypoint /usr/bin/curl felixlohmeier/openrefine-client:v0.3.10 --silent -N "http://${uuid}:3333" | cat | grep -q -o "OpenRefine"; do
        sleep 1
      done
      "${docker[@]}" attach "${uuid}" &
      echo ""
    fi
  done
fi
# transform and export files

# Stops and removes the running OpenRefine container, starts a fresh one
# on the same workspace, waits until the server answers on port 3333 and
# re-attaches to its log output in the background.
# Globals read: docker (command array), uuid, outputdir, version, ram
restart_server() {
  "${docker[@]}" stop -t=5000 "${uuid}"
  "${docker[@]}" rm "${uuid}"
  "${docker[@]}" run -d --name="${uuid}" -v "${outputdir}:/data:z" "felixlohmeier/openrefine:${version}" -i 0.0.0.0 -m "${ram}" -d /data
  until "${docker[@]}" run --rm --link "${uuid}" --entrypoint /usr/bin/curl felixlohmeier/openrefine-client:v0.3.10 --silent -N "http://${uuid}:3333" | cat | grep -q -o "OpenRefine"; do
    sleep 1
  done
  "${docker[@]}" attach "${uuid}" &
}

# Fills the global arrays projectids and projectnames from a project list
# file as written by the client's -l command.
# Arguments: $1 - path to the list file
# The id is taken from columns 2-14 and the name from column 17 onwards of
# each line. Reading line by line keeps a name that contains spaces in one
# element, at the same index as its id. Lines without an id are skipped.
read_project_list() {
  local listfile=$1 line id
  projectids=()
  projectnames=()
  while IFS= read -r line || [ -n "$line" ]; do
    id=${line:1:13}
    id=${id//[[:space:]]/}
    [ -n "$id" ] || continue
    projectids+=("$id")
    projectnames+=("${line:16}")
  done < "$listfile"
}

# Runs only if there are transformation rules or export is enabled.
if [ "${#jsonfiles[@]}" -gt 0 ] || [ "$export" = "true" ]; then
  checkpoints=${#checkpointdate[@]}
  checkpointdate[$((checkpoints + 1))]=$(date +%s)
  checkpointname[$((checkpoints + 1))]="Prepare transform & export"
  echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
  echo ""
  echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
  echo ""

  # get project ids
  echo "get project ids..."
  "${docker[@]}" run --rm --link "${uuid}" felixlohmeier/openrefine-client:v0.3.10 -H "${uuid}" -l > "${outputdir}/projects.tmp"
  read_project_list "${outputdir}/projects.tmp"
  cat "${outputdir}/projects.tmp" && rm "${outputdir:?}/projects.tmp"
  echo ""

  # provide additional OpenRefine projects for cross function
  if [ "${#crossprojects[@]}" -gt 0 ]; then
    echo "provide additional projects for cross function..."
    # copy given projects to workspace
    rsync -a --exclude='*.project/history' "${crossdir}"/*.project "${outputdir}"
    # restart server to advertise copied projects
    echo "restart OpenRefine server to advertise copied projects..."
    restart_server
    echo ""
  fi

  # loop for all projects
  for ((i = 0; i < ${#projectids[@]}; ++i)); do

    # apply transformation rules
    if [ "${#jsonfiles[@]}" -gt 0 ]; then
      checkpoints=${#checkpointdate[@]}
      checkpointdate[$((checkpoints + 1))]=$(date +%s)
      checkpointname[$((checkpoints + 1))]="Transform ${projectnames[i]}"
      echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
      echo ""
      echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
      echo ""
      for jsonfile in "${jsonfiles[@]}"; do
        echo "transform ${jsonfile}..."
        # run client with apply command (config directory mounted at /data)
        "${docker[@]}" run --rm --link "${uuid}" -v "${configdir}:/data:z" felixlohmeier/openrefine-client:v0.3.10 -H "${uuid}" -f "${jsonfile}" "${projectids[i]}"
        # allocated system resources
        ps -o start,etime,%mem,%cpu,rss -C java --sort=start
        # word splitting is intended: one rss value per java process
        # shellcheck disable=SC2207
        memoryload+=($(ps --no-headers -o rss -C java))
        echo ""
        # restart server to clear memory
        if [ "$restarttransform" = "true" ]; then
          echo "save project and restart OpenRefine server..."
          restart_server
        fi
        echo ""
      done
    fi

    # export project to workspace
    if [ "$export" = "true" ]; then
      checkpoints=${#checkpointdate[@]}
      checkpointdate[$((checkpoints + 1))]=$(date +%s)
      checkpointname[$((checkpoints + 1))]="Export ${projectnames[i]}"
      echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
      echo ""
      echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
      echo ""
      # get filename without extension
      filename=${projectnames[i]%.*}
      echo "export to file ${filename}.${exportformat}..."
      # run client with export command (output directory mounted at /data)
      "${docker[@]}" run --rm --link "${uuid}" -v "${outputdir}:/data:z" felixlohmeier/openrefine-client:v0.3.10 -H "${uuid}" -E --output="${filename}.${exportformat}" "${templating[@]}" "${projectids[i]}"
      # show allocated system resources
      ps -o start,etime,%mem,%cpu,rss -C java --sort=start
      # shellcheck disable=SC2207
      memoryload+=($(ps --no-headers -o rss -C java))
      echo ""
    fi

    # restart server to clear memory
    if [ "$restartfile" = "true" ]; then
      echo "restart OpenRefine server..."
      restart_server
    fi
    echo ""
  done

  # list output files
  if [ "$export" = "true" ]; then
    echo "output (number of lines / size in bytes):"
    wc -c -l "${outputdir}"/*."${exportformat}"
    echo ""
  fi
fi
# run cleanup function
cleanup
echo ""

# Prints a number of seconds ($1) as hh:mm:ss. The hours are not wrapped
# at 24, so runs longer than a day are reported correctly.
format_duration() {
  local seconds=$1
  printf '%02d:%02d:%02d' $((seconds / 3600)) $((seconds % 3600 / 60)) $((seconds % 60))
}

# calculate and print checkpoints
echo "=== Statistics ==="
echo ""
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="End process"
echo "starting time and run time of each step:"
checkpoints=${#checkpointdate[@]}
# extra timestamp without a name, so the last step has an end time too
checkpointdate[$((checkpoints + 1))]=$(date +%s)
for ((i = 1; i <= checkpoints; i++)); do
  diffsec=$((checkpointdate[i + 1] - checkpointdate[i]))
  printf '%35s %s (%s)\n' "${checkpointname[i]}" "$(date --date=@"${checkpointdate[i]}")" "$(format_duration "${diffsec}")"
done
echo ""
diffsec=$((checkpointdate[checkpoints] - checkpointdate[1]))
echo "total run time: $(format_duration "${diffsec}") (hh:mm:ss)"

# calculate and print memory load
# memoryload holds the sampled rss values; they are divided by 1024 to be
# reported as MB. An empty list is reported as 0.
max=${memoryload[0]:-0}
for n in "${memoryload[@]}"; do
  ((n > max)) && max=$n
done
echo "highest memory load: $((max / 1024)) MB"