#!/bin/bash
# openrefine-batch.sh, Felix Lohmeier, v1.11, 2017-12-11
# https://github.com/felixlohmeier/openrefine-batch

# declare download URLs for OpenRefine and OpenRefine client
openrefine_URL="https://github.com/opencultureconsulting/openrefine-batch/raw/master/src/openrefine-linux-2017-10-28.tar.gz"
client_URL="https://github.com/opencultureconsulting/openrefine-batch/raw/master/src/openrefine-client_0-3-4_linux-64bit"

# check system requirements
# JAVA holds the path of the java executable found on PATH (empty if none).
JAVA="$(command -v java 2>/dev/null)"
if [ -z "$JAVA" ]; then
  echo 1>&2 "This action requires you to have 'Java JRE' installed. You can download it for free at https://java.com"
  exit 1
fi

# check if wget supports option --show-progress (since wget 1.16)
# wget_opt is either "--show-progress" or the empty string.
if wget --help | grep -q -- '--show-progress'; then
  wget_opt="--show-progress"
else
  wget_opt=""
fi

# autoinstall OpenRefine
# The directory "openrefine" doubles as the "already installed" marker, so it
# is removed again when download or unpacking fails.
if [ ! -d "openrefine" ]; then
  echo "Download OpenRefine..."
  mkdir -p openrefine
  # ${wget_opt:+...} drops the argument entirely when no extra option is set
  if ! wget -q ${wget_opt:+"$wget_opt"} "$openrefine_URL"; then
    echo 1>&2 "error: download of OpenRefine failed"
    rm -r -f openrefine
    exit 1
  fi
  echo "Install OpenRefine in subdirectory openrefine..."
  openrefine_archive="$(basename "$openrefine_URL")"
  if ! tar -xzf "$openrefine_archive" -C openrefine --strip 1 --totals; then
    echo 1>&2 "error: unpacking of OpenRefine failed"
    rm -r -f openrefine "$openrefine_archive"
    exit 1
  fi
  rm -f "$openrefine_archive"
  # append the headless option as a new last line of refine.ini
  sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' openrefine/refine.ini
  # enable the autosave setting with a period of 1440
  sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' openrefine/refine.ini
  # use REFINE_MEMORY for -Xms in the start script
  sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' openrefine/refine
  echo ""
fi

# autoinstall OpenRefine client
# The directory "openrefine-client" doubles as the "already installed" marker,
# so it is removed again when the download fails.
if [ ! -d "openrefine-client" ]; then
  echo "Download OpenRefine client..."
  mkdir -p openrefine-client
  # ${wget_opt:+...} drops the argument entirely when no extra option is set
  if ! wget -q -P openrefine-client ${wget_opt:+"$wget_opt"} "$client_URL"; then
    echo 1>&2 "error: download of OpenRefine client failed"
    rm -r -f openrefine-client
    exit 1
  fi
  chmod +x openrefine-client/openrefine-client_0-3-4_linux-64bit
  echo ""
fi

# help screen
#######################################
# Print the help screen to stdout and terminate the script.
# The here-document delimiter is quoted so that the text (including the
# backslash line continuations in the examples) is printed verbatim.
# Arguments: none
# Outputs:   help text on stdout
# Returns:   does not return; exits with status 1
#######################################
usage() {
  cat <<'EOF'
Usage: ./openrefine-batch.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...

== basic arguments ==
    -a INPUTDIR path to directory with source files (leave empty to transform only; multiple files may be imported into a single project by providing a zip or tar.gz archive, cf. https://github.com/OpenRefine/OpenRefine/wiki/Importers )
    -b TRANSFORMDIR path to directory with OpenRefine transformation rules (json files, cf. http://kb.refinepro.com/2012/06/google-refine-json-and-my-notepad-or.html ; leave empty to transform only)
    -c OUTPUTDIR path to directory for exported files (and OpenRefine workspace)

== options ==
    -d CROSSDIR path to directory with additional OpenRefine projects (will be copied to workspace before transformation step to support the cross function, cf. https://github.com/OpenRefine/OpenRefine/wiki/GREL-Other-Functions )
    -e EXPORTFORMAT (csv, tsv, html, xls, xlsx, ods)
    -f INPUTFORMAT (csv, tsv, xml, json, line-based, fixed-width, xlsx, ods)
    -i INPUTOPTIONS several options provided by openrefine-client, see below...
    -m RAM maximum RAM for OpenRefine java heap space (default: 2048M)
    -p PORT PORT on which OpenRefine should listen (default: 3333)
    -t TEMPLATING several options for templating export, see below...
    -E do NOT export files
    -R do NOT restart OpenRefine after each transformation (e.g. config file)
    -X do NOT restart OpenRefine after each project (e.g. input file)
    -h displays this help screen

== inputoptions (mandatory for xml, json, fixed-width, xlsx, ods) ==
    -i recordPath=RECORDPATH (xml, json): please provide path in multiple arguments without slashes, e.g. /collection/record/ should be entered like this: -i recordPath=collection -i recordPath=record, default xml: record, default json: _ _
    -i columnWidths=COLUMNWIDTHS (fixed-width): please provide widths separated by comma (e.g. 7,5)
    -i sheets=SHEETS (xls, xlsx, ods): please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)

== more inputoptions (optional, only together with inputformat) ==
    -i projectName=PROJECTNAME (all formats), default: filename
    -i limit=LIMIT (all formats), default: -1
    -i includeFileSources=true/false (all formats), default: false
    -i trimStrings=true/false (xml, json), default: false
    -i storeEmptyStrings=true/false (xml, json), default: true
    -i guessCellValueTypes=true/false (xml, csv, tsv, fixed-width, json), default: false
    -i encoding=ENCODING (csv, tsv, line-based, fixed-width), please provide short encoding name (e.g. UTF-8)
    -i ignoreLines=IGNORELINES (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: -1
    -i headerLines=HEADERLINES (csv, tsv, fixed-width, xls, xlsx, ods), default: 1, default fixed-width: 0
    -i skipDataLines=true/false (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: 0, default line-based: -1
    -i storeBlankRows=true/false (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: true
    -i processQuotes=true/false (csv, tsv), default: true
    -i storeBlankCellsAsNulls=true/false (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: true
    -i linesPerRow=LINESPERROW (line-based), default: 1

== templating options (alternative exportformat) ==
    -t template=TEMPLATE (mandatory; (big) text string that you enter in the *row template* textfield in the export/templating menu in the browser app)
    -t mode=row-based/record-based (engine mode, default: row-based)
    -t prefix=PREFIX (text string that you enter in the *prefix* textfield in the browser app)
    -t rowSeparator=ROWSEPARATOR (text string that you enter in the *row separator* textfield in the browser app)
    -t suffix=SUFFIX (text string that you enter in the *suffix* textfield in the browser app)
    -t filterQuery=REGEX (Simple RegEx text filter on filterColumn, e.g. ^12015$)
    -t filterColumn=COLUMNNAME (column name for filterQuery, default: name of first column)
    -t facets=FACETS (facets config in json format, may be extracted with browser dev tools in browser app)
    -t splitToFiles=true/false (will split each row/record into a single file; it specifies a presumably unique character series for splitting; prefix and suffix will be applied to all files)
    -t suffixById=true/false (enhancement option for splitToFiles; will generate filename-suffix from values in key column)

== examples ==

download example data

    wget https://github.com/opencultureconsulting/openrefine-batch/archive/master.zip
    unzip master.zip openrefine-batch-master/examples/*
    mv openrefine-batch-master/examples .
    rm -f master.zip

example 1 (input, transform, export to tsv)

    ./openrefine-batch.sh \
    -a examples/powerhouse-museum/input/ \
    -b examples/powerhouse-museum/config/ \
    -c examples/powerhouse-museum/output/ \
    -f tsv \
    -i processQuotes=false \
    -i guessCellValueTypes=true \
    -RX

example 2 (input, transform, templating export)

    ./openrefine-batch.sh -a examples/powerhouse-museum/input/ -b examples/powerhouse-museum/config/ -c examples/powerhouse-museum/output/ -f tsv -i processQuotes=false -i guessCellValueTypes=true -RX -t template='{ "Record ID" : {{jsonize(cells["Record ID"].value)}}, "Object Title" : {{jsonize(cells["Object Title"].value)}}, "Registration Number" : {{jsonize(cells["Registration Number"].value)}}, "Description." : {{jsonize(cells["Description."].value)}}, "Marks" : {{jsonize(cells["Marks"].value)}}, "Production Date" : {{jsonize(cells["Production Date"].value)}}, "Provenance (Production)" : {{jsonize(cells["Provenance (Production)"].value)}}, "Provenance (History)" : {{jsonize(cells["Provenance (History)"].value)}}, "Categories" : {{jsonize(cells["Categories"].value)}}, "Persistent Link" : {{jsonize(cells["Persistent Link"].value)}}, "Height" : {{jsonize(cells["Height"].value)}}, "Width" : {{jsonize(cells["Width"].value)}}, "Depth" : {{jsonize(cells["Depth"].value)}}, "Diameter" : {{jsonize(cells["Diameter"].value)}}, "Weight" : {{jsonize(cells["Weight"].value)}}, "License info" : {{jsonize(cells["License info"].value)}} }' -t rowSeparator=',' -t prefix='{ "rows" : [ ' -t suffix='] }' -t splitToFiles=true

EOF
  exit 1
}
# defaults
ram="2048M"              # value passed to openrefine/refine -m
port="3333"              # value passed to openrefine/refine -p
restartfile="true"       # restart the server after each project
restarttransform="true"  # restart the server after each transformation
export="true"            # export projects after transformation
exportformat="tsv"       # file extension of exported files
inputdir=/dev/null
configdir=/dev/null
crossdir=/dev/null

# check input
# show the help screen (which exits) when the script is called without arguments
NUMARGS=$#
if [ "$NUMARGS" -eq 0 ]; then
  usage
fi

# get user input
# The leading colon selects getopts' silent error reporting, so that the
# "\?" and ":" arms below receive the offending option letter in OPTARG.
options=":a:b:c:d:e:f:i:m:p:t:ERXh"

#######################################
# Parse the command line options into the script's global settings.
# Globals:   options (read); inputdir, inputfiles, configdir, jsonfiles,
#            outputdir, crossdir, crossprojects, exportformat, format,
#            inputformat, inputoptions, ram, port, templating, export,
#            restarttransform, restartfile, OPTIND (written)
# Arguments: the command line arguments of the script
# Returns:   0 on success; exits with status 1 on invalid options or -h
#######################################
parse_options() {
  local opt entry
  OPTIND=1
  while getopts "$options" opt; do
    case "$opt" in
      a)
        inputdir=$(readlink -f -- "${OPTARG}")
        if [ -n "${inputdir// }" ]; then
          # read names line by line so that file names may contain spaces
          inputfiles=()
          while IFS= read -r entry; do
            inputfiles+=("$entry")
          done < <(find -L "${inputdir}"/* -type f -printf "%f\n" 2>/dev/null)
        fi
        ;;
      b)
        configdir=$(readlink -f -- "${OPTARG}")
        if [ -n "${configdir// }" ]; then
          jsonfiles=()
          while IFS= read -r entry; do
            jsonfiles+=("$entry")
          done < <(find -L "${configdir}"/* -type f -printf "%f\n" 2>/dev/null)
        fi
        ;;
      c)
        outputdir=$(readlink -m -- "${OPTARG}")
        mkdir -p "${outputdir}"
        ;;
      d)
        crossdir=$(readlink -f -- "${OPTARG}")
        if [ -n "${crossdir// }" ]; then
          crossprojects=()
          while IFS= read -r entry; do
            crossprojects+=("$entry")
          done < <(find -L "${crossdir}"/* -maxdepth 0 -type d -printf "%f\n" 2>/dev/null)
        fi
        ;;
      # "format" is reserved for the input format (it drives the check for
      # mandatory input options), so -e only sets the export format
      e) exportformat="${OPTARG}" ;;
      f)
        format="${OPTARG}"
        inputformat="--format=${OPTARG}"
        ;;
      i) inputoptions+=("--${OPTARG}") ;;
      m) ram=${OPTARG} ;;
      p) port=${OPTARG} ;;
      t)
        templating+=("--${OPTARG}")
        exportformat="txt"
        ;;
      E) export="false" ;;
      R) restarttransform="false" ;;
      X) restartfile="false" ;;
      h) usage ;;
      \?)
        echo 1>&2 "Unknown option: -$OPTARG"
        usage
        exit 1
        ;;
      :)
        echo 1>&2 "Missing option argument for -$OPTARG"
        usage
        exit 1
        ;;
      *)
        echo 1>&2 "Unimplemented option: -$OPTARG"
        usage
        exit 1
        ;;
    esac
  done
}

parse_options "$@"
shift $((OPTIND - 1))

# check for mandatory options
# an output directory is always required
if [ -z "$outputdir" ]; then
  echo 1>&2 "please provide path to directory for exported files (and OpenRefine workspace)"
  echo 1>&2 "example: ./openrefine-batch.sh -c output/"
  exit 1
fi
# the output directory must be empty (ls -A also lists hidden entries)
if [ -n "$(ls -A "$outputdir" 2>/dev/null)" ]; then
  echo 1>&2 "path to directory for exported files (and OpenRefine workspace) is not empty"
  echo 1>&2 "$outputdir"
  exit 1
fi
# xml and json need at least one input option (record path)
if { [ "$format" = "xml" ] || [ "$format" = "json" ]; } && [ "${#inputoptions[@]}" -eq 0 ]; then
  echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
  echo 1>&2 "please provide recordpath in multiple arguments without slashes"
  echo 1>&2 "example: ./openrefine-batch.sh ... -f $format -i recordPath=collection -i recordPath=record"
  exit 1
fi
# fixed-width needs at least one input option (column widths)
if [ "$format" = "fixed-width" ] && [ "${#inputoptions[@]}" -eq 0 ]; then
  echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
  echo 1>&2 "please provide column widths separated by comma (e.g. 7,5)"
  echo 1>&2 "example: ./openrefine-batch.sh ... -f $format -i columnWidths=7,5"
  exit 1
fi
# xlsx and ods need at least one input option (sheets)
if { [ "$format" = "xlsx" ] || [ "$format" = "ods" ]; } && [ "${#inputoptions[@]}" -eq 0 ]; then
  echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
  echo 1>&2 "please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)"
  echo 1>&2 "example: ./openrefine-batch.sh ... -f $format -i sheets=0"
  exit 1
fi

# print variables
#######################################
# Print the effective settings of this run, one per line, followed by an
# empty line.
# Globals:   inputdir, inputfiles, inputformat, inputoptions, configdir,
#            jsonfiles, crossdir, crossprojects, ram, port, outputdir,
#            export, exportformat, templating, restartfile,
#            restarttransform (all read)
# Outputs:   settings on stdout
#######################################
print_settings() {
  echo "Input directory: $inputdir"
  echo "Input files: ${inputfiles[*]}"
  echo "Input format: $inputformat"
  echo "Input options: ${inputoptions[*]}"
  echo "Config directory: $configdir"
  echo "Transformation rules: ${jsonfiles[*]}"
  echo "Cross directory: $crossdir"
  echo "Cross projects: ${crossprojects[*]}"
  echo "OpenRefine heap space: $ram"
  echo "OpenRefine port: $port"
  echo "OpenRefine workspace: $outputdir"
  echo "Export to workspace: $export"
  echo "Export format: $exportformat"
  echo "Templating options: ${templating[*]}"
  echo "restart after file: $restartfile"
  echo "restart after transform: $restarttransform"
  echo ""
}

print_settings

# declare additional variables
# checkpointdate/checkpointname are filled starting at index 1: the next free
# index is always the current number of elements plus one.
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="Start process"
# collects one "ps -o rss" sample of the server process per processing step
memoryload=()

# safe cleanup handler
#######################################
# Stop the OpenRefine server and remove workspace residue from the output
# directory.
# Globals:   pid, outputdir, crossprojects (read)
# Outputs:   progress message on stdout
#######################################
cleanup() {
  local project
  echo "cleanup..."
  # only signal the server if one has been started
  if [ -n "${pid}" ]; then
    kill "${pid}"
  fi
  # wait for all background children to terminate
  wait
  # ${outputdir:?} aborts instead of deleting from / when outputdir is empty
  rm -r -f "${outputdir:?}"/workspace*.json
  # delete duplicates from copied projects
  if [ "${#crossprojects[@]}" -gt 0 ]; then
    for project in "${crossprojects[@]}"; do
      rm -r -f "${outputdir:?}/${project}"
    done
  fi
}
trap "cleanup;exit" SIGHUP SIGINT SIGQUIT SIGTERM

# launch server
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="Launch OpenRefine"
echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
echo ""
echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
echo ""
openrefine/refine -p "${port}" -d "${outputdir}" -m "${ram}" &
pid=$!
# wait until server is available
until wget -q -O - "http://localhost:${port}" | grep -q "OpenRefine"; do
  # give up instead of polling forever when the server process is gone
  if ! kill -0 "${pid}" 2>/dev/null; then
    echo 1>&2 "error: OpenRefine server terminated unexpectedly"
    exit 1
  fi
  sleep 1
done
echo ""
# import all files
if [ "${#inputfiles[@]}" -gt 0 ]; then
  checkpoints=${#checkpointdate[@]}
  checkpointdate[$((checkpoints + 1))]=$(date +%s)
  checkpointname[$((checkpoints + 1))]="Import all files"
  echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
  echo ""
  echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
  echo ""
  for inputfile in "${inputfiles[@]}"; do
    echo "import ${inputfile}..."
    # run client with input command
    # ${inputformat:+...} drops the argument entirely when no format was given
    openrefine-client/openrefine-client_0-3-4_linux-64bit -P "${port}" -c "${inputdir}/${inputfile}" ${inputformat:+"$inputformat"} "${inputoptions[@]}"
    # show allocated system resources
    ps -o start,etime,%mem,%cpu,rss -p "${pid}" --sort=start
    # unquoted on purpose: word splitting strips the padding around the number
    memoryload+=($(ps --no-headers -o rss -p "${pid}"))
    echo ""
    # restart server to clear memory
    if [ "$restartfile" = "true" ]; then
      echo "save project and restart OpenRefine server..."
      kill "${pid}"
      wait
      echo ""
      openrefine/refine -p "${port}" -d "${outputdir}" -m "${ram}" &
      pid=$!
      # wait until server is available
      until wget -q -O - "http://localhost:${port}" | grep -q "OpenRefine"; do
        # give up instead of polling forever when the server process is gone
        if ! kill -0 "${pid}" 2>/dev/null; then
          echo 1>&2 "error: OpenRefine server terminated unexpectedly"
          exit 1
        fi
        sleep 1
      done
      echo ""
    fi
  done
fi

#######################################
# Stop the running OpenRefine server, start a new instance with the same
# settings and block until it answers on its port.
# Globals:   port, outputdir, ram (read); pid (read and written)
# Outputs:   one empty line on stdout between stop and start
# Returns:   0 once the server answers; exits with status 1 if the server
#            process disappears while waiting
#######################################
restart_openrefine() {
  kill "${pid}"
  wait
  echo ""
  openrefine/refine -p "${port}" -d "${outputdir}" -m "${ram}" &
  pid=$!
  until wget -q -O - "http://localhost:${port}" | grep -q "OpenRefine"; do
    # give up instead of polling forever when the server process is gone
    if ! kill -0 "${pid}" 2>/dev/null; then
      echo 1>&2 "error: OpenRefine server terminated unexpectedly"
      exit 1
    fi
    sleep 1
  done
}

# transform and export files
if [ "${#jsonfiles[@]}" -gt 0 ] || [ "$export" = "true" ]; then
  checkpoints=${#checkpointdate[@]}
  checkpointdate[$((checkpoints + 1))]=$(date +%s)
  checkpointname[$((checkpoints + 1))]="Prepare transform & export"
  echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
  echo ""
  echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
  echo ""
  # get project ids
  echo "get project ids..."
  openrefine-client/openrefine-client_0-3-4_linux-64bit -P "${port}" -l > "${outputdir}/projects.tmp"
  # ids are taken from columns 2-14 of each listed line; they are split on
  # whitespace, so empty lines yield no entry
  projectids=($(cut -c 2-14 "${outputdir}/projects.tmp"))
  # names are taken from column 17 onwards; they are read line by line so
  # that a name containing spaces stays aligned with its id
  projectnames=()
  while IFS= read -r projectname; do
    if [ -n "${projectname// }" ]; then
      projectnames+=("$projectname")
    fi
  done < <(cut -c 17- "${outputdir}/projects.tmp")
  cat "${outputdir}/projects.tmp" && rm "${outputdir:?}/projects.tmp"
  echo ""
  # provide additional OpenRefine projects for cross function
  if [ "${#crossprojects[@]}" -gt 0 ]; then
    echo "provide additional projects for cross function..."
    # copy given projects to workspace
    rsync -a --exclude='*.project/history' "${crossdir}"/*.project "${outputdir}"
    # restart server to advertise copied projects
    echo "restart OpenRefine server to advertise copied projects..."
    restart_openrefine
    echo ""
  fi
  # loop for all projects
  for ((i = 0; i < ${#projectids[@]}; ++i)); do
    # apply transformation rules
    if [ "${#jsonfiles[@]}" -gt 0 ]; then
      checkpoints=${#checkpointdate[@]}
      checkpointdate[$((checkpoints + 1))]=$(date +%s)
      checkpointname[$((checkpoints + 1))]="Transform ${projectnames[i]}"
      echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
      echo ""
      echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
      echo ""
      for jsonfile in "${jsonfiles[@]}"; do
        echo "transform ${jsonfile}..."
        # run client with apply command
        openrefine-client/openrefine-client_0-3-4_linux-64bit -P "${port}" -f "${configdir}/${jsonfile}" "${projectids[i]}"
        # allocated system resources
        ps -o start,etime,%mem,%cpu,rss -p "${pid}" --sort=start
        # unquoted on purpose: word splitting strips the padding around the number
        memoryload+=($(ps --no-headers -o rss -p "${pid}"))
        echo ""
        # restart server to clear memory
        if [ "$restarttransform" = "true" ]; then
          echo "save project and restart OpenRefine server..."
          restart_openrefine
        fi
        echo ""
      done
    fi
    # export project to workspace
    if [ "$export" = "true" ]; then
      checkpoints=${#checkpointdate[@]}
      checkpointdate[$((checkpoints + 1))]=$(date +%s)
      checkpointname[$((checkpoints + 1))]="Export ${projectnames[i]}"
      echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
      echo ""
      echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
      echo ""
      # get filename without extension
      filename=${projectnames[i]%.*}
      echo "export to file ${filename}.${exportformat}..."
      # run client with export command
      openrefine-client/openrefine-client_0-3-4_linux-64bit -P "${port}" -E --output="${outputdir}/${filename}.${exportformat}" "${templating[@]}" "${projectids[i]}"
      # show allocated system resources
      ps -o start,etime,%mem,%cpu,rss -p "${pid}" --sort=start
      # unquoted on purpose: word splitting strips the padding around the number
      memoryload+=($(ps --no-headers -o rss -p "${pid}"))
      echo ""
    fi
    # restart server to clear memory
    if [ "$restartfile" = "true" ]; then
      echo "restart OpenRefine server..."
      restart_openrefine
    fi
    echo ""
  done
  # list output files
  if [ "$export" = "true" ]; then
    echo "output (number of lines / size in bytes):"
    wc -c -l "${outputdir}"/*."${exportformat}"
    echo ""
  fi
fi

# run cleanup function
cleanup
echo ""

#######################################
# Format a duration as hh:mm:ss. The hour field is not wrapped at 24, so
# durations of a day or more are printed correctly (e.g. 25:00:00).
# Arguments: $1 - duration in seconds
# Outputs:   formatted duration on stdout (no trailing newline)
#######################################
format_duration() {
  local seconds=$1
  printf '%02d:%02d:%02d' \
    $((seconds / 3600)) $((seconds % 3600 / 60)) $((seconds % 60))
}

#######################################
# Print start time and run time of every recorded checkpoint, the total run
# time and the highest sampled memory load.
# Globals:   checkpointdate, checkpointname, checkpoints (read and written);
#            memoryload (read)
# Outputs:   statistics on stdout
#######################################
print_statistics() {
  local i diffsec max n
  # calculate and print checkpoints
  echo "=== Statistics ==="
  echo ""
  checkpoints=${#checkpointdate[@]}
  checkpointdate[$((checkpoints + 1))]=$(date +%s)
  checkpointname[$((checkpoints + 1))]="End process"
  echo "starting time and run time of each step:"
  checkpoints=${#checkpointdate[@]}
  # extra timestamp after the last checkpoint, so that the loop below can
  # always subtract "next" from "current"
  checkpointdate[$((checkpoints + 1))]=$(date +%s)
  for ((i = 1; i <= checkpoints; i++)); do
    diffsec=$((checkpointdate[i + 1] - checkpointdate[i]))
    printf '%35s %s (%s)\n' "${checkpointname[$i]}" \
      "$(date --date=@"${checkpointdate[$i]}")" "$(format_duration "$diffsec")"
  done
  echo ""
  diffsec=$((checkpointdate[checkpoints] - checkpointdate[1]))
  echo "total run time: $(format_duration "$diffsec") (hh:mm:ss)"

  # calculate and print memory load
  # values are divided by 1024 for the MB figure below
  max=${memoryload[0]}
  for n in "${memoryload[@]}"; do
    if ((n > max)); then
      max=$n
    fi
  done
  echo "highest memory load: $((max / 1024)) MB"
}

print_statistics