release v1.11, added support for templating export
This commit is contained in:
parent
52fff4281b
commit
d60f732244
46
README.md
46
README.md
|
@ -90,33 +90,46 @@ Usage: ./openrefine-batch.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...
|
||||||
-i INPUTOPTIONS several options provided by openrefine-client, see below...
|
-i INPUTOPTIONS several options provided by openrefine-client, see below...
|
||||||
-m RAM maximum RAM for OpenRefine java heap space (default: 2048M)
|
-m RAM maximum RAM for OpenRefine java heap space (default: 2048M)
|
||||||
-p PORT PORT on which OpenRefine should listen (default: 3333)
|
-p PORT PORT on which OpenRefine should listen (default: 3333)
|
||||||
|
-t TEMPLATING several options for templating export, see below...
|
||||||
-E do NOT export files
|
-E do NOT export files
|
||||||
-R do NOT restart OpenRefine after each transformation (e.g. config file)
|
-R do NOT restart OpenRefine after each transformation (e.g. config file)
|
||||||
-X do NOT restart OpenRefine after each project (e.g. input file)
|
-X do NOT restart OpenRefine after each project (e.g. input file)
|
||||||
-h displays this help screen
|
-h displays this help screen
|
||||||
|
|
||||||
== inputoptions (mandatory for xml, json, fixed-width, xlsx, ods) ==
|
== inputoptions (mandatory for xml, json, fixed-width, xlsx, ods) ==
|
||||||
-i recordPath=RECORDPATH (xml, json): please provide path in multiple arguments without slashes, e.g. /collection/record/ should be entered like this: --recordPath=collection --recordPath=record
|
-i recordPath=RECORDPATH (xml, json): please provide path in multiple arguments without slashes, e.g. /collection/record/ should be entered like this: -i recordPath=collection -i recordPath=record, default xml: record, default json: _ _
|
||||||
-i columnWidths=COLUMNWIDTHS (fixed-width): please provide widths separated by comma (e.g. 7,5)
|
-i columnWidths=COLUMNWIDTHS (fixed-width): please provide widths separated by comma (e.g. 7,5)
|
||||||
-i sheets=SHEETS (xlsx, ods): please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)
|
-i sheets=SHEETS (xls, xlsx, ods): please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)
|
||||||
|
|
||||||
== more inputoptions (optional, only together with inputformat) ==
|
== more inputoptions (optional, only together with inputformat) ==
|
||||||
-i projectName=PROJECTNAME (all formats)
|
-i projectName=PROJECTNAME (all formats), default: filename
|
||||||
-i limit=LIMIT (all formats), default: -1
|
-i limit=LIMIT (all formats), default: -1
|
||||||
-i includeFileSources=INCLUDEFILESOURCES (all formats), default: false
|
-i includeFileSources=true/false (all formats), default: false
|
||||||
-i trimStrings=TRIMSTRINGS (xml, json), default: false
|
-i trimStrings=true/false (xml, json), default: false
|
||||||
-i storeEmptyStrings=STOREEMPTYSTRINGS (xml, json), default: true
|
-i storeEmptyStrings=true/false (xml, json), default: true
|
||||||
-i guessCellValueTypes=GUESSCELLVALUETYPES (xml, csv, tsv, fixed-width, json), default: false
|
-i guessCellValueTypes=true/false (xml, csv, tsv, fixed-width, json), default: false
|
||||||
-i encoding=ENCODING (csv, tsv, line-based, fixed-width), please provide short encoding name (e.g. UTF-8)
|
-i encoding=ENCODING (csv, tsv, line-based, fixed-width), please provide short encoding name (e.g. UTF-8)
|
||||||
-i ignoreLines=IGNORELINES (csv, tsv, line-based, fixed-width, xlsx, ods), default: -1
|
-i ignoreLines=IGNORELINES (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: -1
|
||||||
-i headerLines=HEADERLINES (csv, tsv, fixed-width, xlsx, ods), default: 1
|
-i headerLines=HEADERLINES (csv, tsv, fixed-width, xls, xlsx, ods), default: 1, default fixed-width: 0
|
||||||
-i skipDataLines=SKIPDATALINES (csv, tsv, line-based, fixed-width, xlsx, ods), default: 0
|
-i skipDataLines=SKIPDATALINES (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: 0, default line-based: -1
|
||||||
-i storeBlankRows=STOREBLANKROWS (csv, tsv, line-based, fixed-width, xlsx, ods), default: true
|
-i storeBlankRows=true/false (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: true
|
||||||
-i processQuotes=PROCESSQUOTES (csv, tsv), default: true
|
-i processQuotes=true/false (csv, tsv), default: true
|
||||||
-i storeBlankCellsAsNulls=STOREBLANKCELLSASNULLS (csv, tsv, line-based, fixed-width, xlsx, ods), default: true
|
-i storeBlankCellsAsNulls=true/false (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: true
|
||||||
-i linesPerRow=LINESPERROW (line-based), default: 1
|
-i linesPerRow=LINESPERROW (line-based), default: 1
|
||||||
|
|
||||||
== example ==
|
== templating options (alternative exportformat) ==
|
||||||
|
-t template=TEMPLATE (mandatory; (big) text string that you enter in the *row template* textfield in the export/templating menu in the browser app)
|
||||||
|
-t mode=row-based/record-based (engine mode, default: row-based)
|
||||||
|
-t prefix=PREFIX (text string that you enter in the *prefix* textfield in the browser app)
|
||||||
|
-t rowSeparator=ROWSEPARATOR (text string that you enter in the *row separator* textfield in the browser app)
|
||||||
|
-t suffix=SUFFIX (text string that you enter in the *suffix* textfield in the browser app)
|
||||||
|
-t filterQuery=REGEX (Simple RegEx text filter on filterColumn, e.g. ^12015$)
|
||||||
|
-t filterColumn=COLUMNNAME (column name for filterQuery, default: name of first column)
|
||||||
|
-t facets=FACETS (facets config in json format, may be extracted with browser dev tools in browser app)
|
||||||
|
-t splitToFiles=true/false (will split each row/record into a single file; it specifies a presumably unique character series for splitting; prefix and suffix will be applied to all files)
|
||||||
|
-t suffixById=true/false (enhancement option for splitToFiles; will generate filename-suffix from values in key column)
|
||||||
|
|
||||||
|
== examples ==
|
||||||
|
|
||||||
download example data
|
download example data
|
||||||
|
|
||||||
|
@ -125,10 +138,13 @@ unzip master.zip openrefine-batch-master/examples/*
|
||||||
mv openrefine-batch-master/examples .
|
mv openrefine-batch-master/examples .
|
||||||
rm -f master.zip
|
rm -f master.zip
|
||||||
|
|
||||||
execute openrefine-batch.sh
|
example 1 (input, transform, export to tsv)
|
||||||
|
|
||||||
./openrefine-batch.sh -a examples/powerhouse-museum/input/ -b examples/powerhouse-museum/config/ -c examples/powerhouse-museum/output/ -f tsv -i processQuotes=false -i guessCellValueTypes=true -RX
|
./openrefine-batch.sh -a examples/powerhouse-museum/input/ -b examples/powerhouse-museum/config/ -c examples/powerhouse-museum/output/ -f tsv -i processQuotes=false -i guessCellValueTypes=true -RX
|
||||||
|
|
||||||
|
example 2 (input, transform, templating export)
|
||||||
|
|
||||||
|
./openrefine-batch.sh -a examples/powerhouse-museum/input/ -b examples/powerhouse-museum/config/ -c examples/powerhouse-museum/output/ -f tsv -i processQuotes=false -i guessCellValueTypes=true -RX -t template='{ "Record ID" : {{jsonize(cells["Record ID"].value)}}, "Object Title" : {{jsonize(cells["Object Title"].value)}}, "Registration Number" : {{jsonize(cells["Registration Number"].value)}}, "Description." : {{jsonize(cells["Description."].value)}}, "Marks" : {{jsonize(cells["Marks"].value)}}, "Production Date" : {{jsonize(cells["Production Date"].value)}}, "Provenance (Production)" : {{jsonize(cells["Provenance (Production)"].value)}}, "Provenance (History)" : {{jsonize(cells["Provenance (History)"].value)}}, "Categories" : {{jsonize(cells["Categories"].value)}}, "Persistent Link" : {{jsonize(cells["Persistent Link"].value)}}, "Height" : {{jsonize(cells["Height"].value)}}, "Width" : {{jsonize(cells["Width"].value)}}, "Depth" : {{jsonize(cells["Depth"].value)}}, "Diameter" : {{jsonize(cells["Diameter"].value)}}, "Weight" : {{jsonize(cells["Weight"].value)}}, "License info" : {{jsonize(cells["License info"].value)}} }' -t rowSeparator=',' -t prefix='{ "rows" : [ ' -t suffix='] }' -t splitToFiles=true
|
||||||
```
|
```
|
||||||
|
|
||||||
### Logging
|
### Logging
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# openrefine-batch-docker.sh, Felix Lohmeier, v1.10, 2017-11-07
|
# openrefine-batch-docker.sh, Felix Lohmeier, v1.11, 2017-12-11
|
||||||
# https://github.com/felixlohmeier/openrefine-batch
|
# https://github.com/felixlohmeier/openrefine-batch
|
||||||
|
|
||||||
# check system requirements
|
# check system requirements
|
||||||
|
@ -30,6 +30,7 @@ Usage: sudo ./openrefine-batch-docker.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUT
|
||||||
-f INPUTFORMAT (csv, tsv, xml, json, line-based, fixed-width, xlsx, ods)
|
-f INPUTFORMAT (csv, tsv, xml, json, line-based, fixed-width, xlsx, ods)
|
||||||
-i INPUTOPTIONS several options provided by openrefine-client, see below...
|
-i INPUTOPTIONS several options provided by openrefine-client, see below...
|
||||||
-m RAM maximum RAM for OpenRefine java heap space (default: 2048M)
|
-m RAM maximum RAM for OpenRefine java heap space (default: 2048M)
|
||||||
|
-t TEMPLATING several options for templating export, see below...
|
||||||
-v VERSION OpenRefine version (2.7, 2.7rc2, 2.7rc1, 2.6rc2, 2.6rc1, dev; default: 2.7)
|
-v VERSION OpenRefine version (2.7, 2.7rc2, 2.7rc1, 2.6rc2, 2.6rc1, dev; default: 2.7)
|
||||||
-E do NOT export files
|
-E do NOT export files
|
||||||
-R do NOT restart OpenRefine after each transformation (e.g. config file)
|
-R do NOT restart OpenRefine after each transformation (e.g. config file)
|
||||||
|
@ -37,27 +38,39 @@ Usage: sudo ./openrefine-batch-docker.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUT
|
||||||
-h displays this help screen
|
-h displays this help screen
|
||||||
|
|
||||||
== inputoptions (mandatory for xml, json, fixed-width, xslx, ods) ==
|
== inputoptions (mandatory for xml, json, fixed-width, xslx, ods) ==
|
||||||
-i recordPath=RECORDPATH (xml, json): please provide path in multiple arguments without slashes, e.g. /collection/record/ should be entered like this: --recordPath=collection --recordPath=record
|
-i recordPath=RECORDPATH (xml, json): please provide path in multiple arguments without slashes, e.g. /collection/record/ should be entered like this: -i recordPath=collection -i recordPath=record, default xml: record, default json: _ _
|
||||||
-i columnWidths=COLUMNWIDTHS (fixed-width): please provide widths separated by comma (e.g. 7,5)
|
-i columnWidths=COLUMNWIDTHS (fixed-width): please provide widths separated by comma (e.g. 7,5)
|
||||||
-i sheets=SHEETS (xlsx, ods): please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)
|
-i sheets=SHEETS (xls, xlsx, ods): please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)
|
||||||
|
|
||||||
== more inputoptions (optional, only together with inputformat) ==
|
== more inputoptions (optional, only together with inputformat) ==
|
||||||
-i projectName=PROJECTNAME (all formats)
|
-i projectName=PROJECTNAME (all formats), default: filename
|
||||||
-i limit=LIMIT (all formats), default: -1
|
-i limit=LIMIT (all formats), default: -1
|
||||||
-i includeFileSources=INCLUDEFILESOURCES (all formats), default: false
|
-i includeFileSources=true/false (all formats), default: false
|
||||||
-i trimStrings=TRIMSTRINGS (xml, json), default: false
|
-i trimStrings=true/false (xml, json), default: false
|
||||||
-i storeEmptyStrings=STOREEMPTYSTRINGS (xml, json), default: true
|
-i storeEmptyStrings=true/false (xml, json), default: true
|
||||||
-i guessCellValueTypes=GUESSCELLVALUETYPES (xml, csv, tsv, fixed-width, json), default: false
|
-i guessCellValueTypes=true/false (xml, csv, tsv, fixed-width, json), default: false
|
||||||
-i encoding=ENCODING (csv, tsv, line-based, fixed-width), please provide short encoding name (e.g. UTF-8)
|
-i encoding=ENCODING (csv, tsv, line-based, fixed-width), please provide short encoding name (e.g. UTF-8)
|
||||||
-i ignoreLines=IGNORELINES (csv, tsv, line-based, fixed-width, xlsx, ods), default: -1
|
-i ignoreLines=IGNORELINES (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: -1
|
||||||
-i headerLines=HEADERLINES (csv, tsv, fixed-width, xlsx, ods), default: 1
|
-i headerLines=HEADERLINES (csv, tsv, fixed-width, xls, xlsx, ods), default: 1, default fixed-width: 0
|
||||||
-i skipDataLines=SKIPDATALINES (csv, tsv, line-based, fixed-width, xlsx, ods), default: 0
|
-i skipDataLines=true/false (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: 0, default line-based: -1
|
||||||
-i storeBlankRows=STOREBLANKROWS (csv, tsv, line-based, fixed-width, xlsx, ods), default: true
|
-i storeBlankRows=true/false (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: true
|
||||||
-i processQuotes=PROCESSQUOTES (csv, tsv), default: true
|
-i processQuotes=true/false (csv, tsv), default: true
|
||||||
-i storeBlankCellsAsNulls=STOREBLANKCELLSASNULLS (csv, tsv, line-based, fixed-width, xlsx, ods), default: true
|
-i storeBlankCellsAsNulls=true/false (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: true
|
||||||
-i linesPerRow=LINESPERROW (line-based), default: 1
|
-i linesPerRow=LINESPERROW (line-based), default: 1
|
||||||
|
|
||||||
== example ==
|
== templating options (alternative exportformat) ==
|
||||||
|
-t template=TEMPLATE (mandatory; (big) text string that you enter in the *row template* textfield in the export/templating menu in the browser app)
|
||||||
|
-t mode=row-based/record-based (engine mode, default: row-based)
|
||||||
|
-t prefix=PREFIX (text string that you enter in the *prefix* textfield in the browser app)
|
||||||
|
-t rowSeparator=ROWSEPARATOR (text string that you enter in the *row separator* textfield in the browser app)
|
||||||
|
-t suffix=SUFFIX (text string that you enter in the *suffix* textfield in the browser app)
|
||||||
|
-t filterQuery=REGEX (Simple RegEx text filter on filterColumn, e.g. ^12015$)
|
||||||
|
-t filterColumn=COLUMNNAME (column name for filterQuery, default: name of first column)
|
||||||
|
-t facets=FACETS (facets config in json format, may be extracted with browser dev tools in browser app)
|
||||||
|
-t splitToFiles=true/false (will split each row/record into a single file; it specifies a presumably unique character series for splitting; prefix and suffix will be applied to all files
|
||||||
|
-t suffixById=true/false (enhancement option for splitToFiles; will generate filename-suffix from values in key column)
|
||||||
|
|
||||||
|
== examples ==
|
||||||
|
|
||||||
download example data
|
download example data
|
||||||
|
|
||||||
|
@ -66,7 +79,7 @@ unzip master.zip openrefine-batch-master/examples/*
|
||||||
mv openrefine-batch-master/examples .
|
mv openrefine-batch-master/examples .
|
||||||
rm -f master.zip
|
rm -f master.zip
|
||||||
|
|
||||||
execute openrefine-batch-docker.sh
|
example 1 (input, transform, export to tsv)
|
||||||
|
|
||||||
sudo ./openrefine-batch-docker.sh \
|
sudo ./openrefine-batch-docker.sh \
|
||||||
-a examples/powerhouse-museum/input/ \
|
-a examples/powerhouse-museum/input/ \
|
||||||
|
@ -77,6 +90,9 @@ sudo ./openrefine-batch-docker.sh \
|
||||||
-i guessCellValueTypes=true \
|
-i guessCellValueTypes=true \
|
||||||
-RX
|
-RX
|
||||||
|
|
||||||
|
example 2 (input, transform, templating export)
|
||||||
|
|
||||||
|
sudo ./openrefine-batch-docker.sh -a examples/powerhouse-museum/input/ -b examples/powerhouse-museum/config/ -c examples/powerhouse-museum/output/ -f tsv -i processQuotes=false -i guessCellValueTypes=true -RX -t template='{ "Record ID" : {{jsonize(cells["Record ID"].value)}}, "Object Title" : {{jsonize(cells["Object Title"].value)}}, "Registration Number" : {{jsonize(cells["Registration Number"].value)}}, "Description." : {{jsonize(cells["Description."].value)}}, "Marks" : {{jsonize(cells["Marks"].value)}}, "Production Date" : {{jsonize(cells["Production Date"].value)}}, "Provenance (Production)" : {{jsonize(cells["Provenance (Production)"].value)}}, "Provenance (History)" : {{jsonize(cells["Provenance (History)"].value)}}, "Categories" : {{jsonize(cells["Categories"].value)}}, "Persistent Link" : {{jsonize(cells["Persistent Link"].value)}}, "Height" : {{jsonize(cells["Height"].value)}}, "Width" : {{jsonize(cells["Width"].value)}}, "Depth" : {{jsonize(cells["Depth"].value)}}, "Diameter" : {{jsonize(cells["Diameter"].value)}}, "Weight" : {{jsonize(cells["Weight"].value)}}, "License info" : {{jsonize(cells["License info"].value)}} }' -t rowSeparator=',' -t prefix='{ "rows" : [ ' -t suffix='] }' -t splitToFiles=true
|
||||||
EOF
|
EOF
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
@ -99,7 +115,7 @@ if [ "$NUMARGS" -eq 0 ]; then
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# get user input
|
# get user input
|
||||||
options="a:b:c:d:e:f:i:m:p:ERXh"
|
options="a:b:c:d:e:f:i:m:t:v:ERXh"
|
||||||
while getopts $options opt; do
|
while getopts $options opt; do
|
||||||
case $opt in
|
case $opt in
|
||||||
a ) inputdir=$(readlink -f ${OPTARG}); if [ -n "${inputdir// }" ] ; then inputfiles=($(find -L "${inputdir}"/* -type f -printf "%f\n" 2>/dev/null)); fi ;;
|
a ) inputdir=$(readlink -f ${OPTARG}); if [ -n "${inputdir// }" ] ; then inputfiles=($(find -L "${inputdir}"/* -type f -printf "%f\n" 2>/dev/null)); fi ;;
|
||||||
|
@ -110,6 +126,7 @@ while getopts $options opt; do
|
||||||
f ) format="${OPTARG}" ; inputformat="--format=${OPTARG}" ;;
|
f ) format="${OPTARG}" ; inputformat="--format=${OPTARG}" ;;
|
||||||
i ) inputoptions+=("--${OPTARG}") ;;
|
i ) inputoptions+=("--${OPTARG}") ;;
|
||||||
m ) ram=${OPTARG} ;;
|
m ) ram=${OPTARG} ;;
|
||||||
|
t ) templating+=("--${OPTARG}") ; exportformat="txt" ;;
|
||||||
v ) version=${OPTARG} ;;
|
v ) version=${OPTARG} ;;
|
||||||
E ) export="false" ;;
|
E ) export="false" ;;
|
||||||
R ) restarttransform="false" ;;
|
R ) restarttransform="false" ;;
|
||||||
|
@ -167,6 +184,7 @@ echo "OpenRefine version: $version"
|
||||||
echo "OpenRefine workspace: $outputdir"
|
echo "OpenRefine workspace: $outputdir"
|
||||||
echo "Export to workspace: $export"
|
echo "Export to workspace: $export"
|
||||||
echo "Export format: $exportformat"
|
echo "Export format: $exportformat"
|
||||||
|
echo "Templating options: ${templating[*]}"
|
||||||
echo "Docker container name: $uuid"
|
echo "Docker container name: $uuid"
|
||||||
echo "restart after file: $restartfile"
|
echo "restart after file: $restartfile"
|
||||||
echo "restart after transform: $restarttransform"
|
echo "restart after transform: $restarttransform"
|
||||||
|
@ -316,7 +334,7 @@ if [ -n "$jsonfiles" ] || [ "$export" = "true" ]; then
|
||||||
filename=${projectnames[i]%.*}
|
filename=${projectnames[i]%.*}
|
||||||
echo "export to file ${filename}.${exportformat}..."
|
echo "export to file ${filename}.${exportformat}..."
|
||||||
# run client with export command
|
# run client with export command
|
||||||
sudo docker run --rm --link ${uuid} -v ${outputdir}:/data:z felixlohmeier/openrefine-client -H ${uuid} -E --output="${filename}.${exportformat}" ${projectids[i]}
|
sudo docker run --rm --link ${uuid} -v ${outputdir}:/data:z felixlohmeier/openrefine-client -H ${uuid} -E --output="${filename}.${exportformat}" "${templating[@]}" ${projectids[i]}
|
||||||
# show allocated system resources
|
# show allocated system resources
|
||||||
ps -o start,etime,%mem,%cpu,rss -C java --sort=start
|
ps -o start,etime,%mem,%cpu,rss -C java --sort=start
|
||||||
memoryload+=($(ps --no-headers -o rss -C java))
|
memoryload+=($(ps --no-headers -o rss -C java))
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# openrefine-batch.sh, Felix Lohmeier, v1.10, 2017-11-07
|
# openrefine-batch.sh, Felix Lohmeier, v1.11, 2017-12-11
|
||||||
# https://github.com/felixlohmeier/openrefine-batch
|
# https://github.com/felixlohmeier/openrefine-batch
|
||||||
|
|
||||||
# declare download URLs for OpenRefine and OpenRefine client
|
# declare download URLs for OpenRefine and OpenRefine client
|
||||||
openrefine_URL="https://github.com/opencultureconsulting/openrefine-batch/raw/master/src/openrefine-linux-2017-10-28.tar.gz"
|
openrefine_URL="https://github.com/opencultureconsulting/openrefine-batch/raw/master/src/openrefine-linux-2017-10-28.tar.gz"
|
||||||
client_URL="https://github.com/opencultureconsulting/openrefine-batch/raw/master/src/openrefine-client_0-3-1_linux-64bit"
|
client_URL="https://github.com/opencultureconsulting/openrefine-batch/raw/master/src/openrefine-client_0-3-4_linux-64bit"
|
||||||
|
|
||||||
# check system requirements
|
# check system requirements
|
||||||
JAVA="$(which java 2> /dev/null)"
|
JAVA="$(which java 2> /dev/null)"
|
||||||
|
@ -34,7 +34,7 @@ if [ ! -d "openrefine-client" ]; then
|
||||||
echo "Download OpenRefine client..."
|
echo "Download OpenRefine client..."
|
||||||
mkdir -p openrefine-client
|
mkdir -p openrefine-client
|
||||||
wget -q -P openrefine-client $wget_opt $client_URL
|
wget -q -P openrefine-client $wget_opt $client_URL
|
||||||
chmod +x openrefine-client/openrefine-client_0-3-1_linux-64bit
|
chmod +x openrefine-client/openrefine-client_0-3-4_linux-64bit
|
||||||
echo ""
|
echo ""
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
@ -55,33 +55,46 @@ Usage: ./openrefine-batch.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...
|
||||||
-i INPUTOPTIONS several options provided by openrefine-client, see below...
|
-i INPUTOPTIONS several options provided by openrefine-client, see below...
|
||||||
-m RAM maximum RAM for OpenRefine java heap space (default: 2048M)
|
-m RAM maximum RAM for OpenRefine java heap space (default: 2048M)
|
||||||
-p PORT PORT on which OpenRefine should listen (default: 3333)
|
-p PORT PORT on which OpenRefine should listen (default: 3333)
|
||||||
|
-t TEMPLATING several options for templating export, see below...
|
||||||
-E do NOT export files
|
-E do NOT export files
|
||||||
-R do NOT restart OpenRefine after each transformation (e.g. config file)
|
-R do NOT restart OpenRefine after each transformation (e.g. config file)
|
||||||
-X do NOT restart OpenRefine after each project (e.g. input file)
|
-X do NOT restart OpenRefine after each project (e.g. input file)
|
||||||
-h displays this help screen
|
-h displays this help screen
|
||||||
|
|
||||||
== inputoptions (mandatory for xml, json, fixed-width, xslx, ods) ==
|
== inputoptions (mandatory for xml, json, fixed-width, xslx, ods) ==
|
||||||
-i recordPath=RECORDPATH (xml, json): please provide path in multiple arguments without slashes, e.g. /collection/record/ should be entered like this: --recordPath=collection --recordPath=record
|
-i recordPath=RECORDPATH (xml, json): please provide path in multiple arguments without slashes, e.g. /collection/record/ should be entered like this: -i recordPath=collection -i recordPath=record, default xml: record, default json: _ _
|
||||||
-i columnWidths=COLUMNWIDTHS (fixed-width): please provide widths separated by comma (e.g. 7,5)
|
-i columnWidths=COLUMNWIDTHS (fixed-width): please provide widths separated by comma (e.g. 7,5)
|
||||||
-i sheets=SHEETS (xlsx, ods): please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)
|
-i sheets=SHEETS (xls, xlsx, ods): please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)
|
||||||
|
|
||||||
== more inputoptions (optional, only together with inputformat) ==
|
== more inputoptions (optional, only together with inputformat) ==
|
||||||
-i projectName=PROJECTNAME (all formats)
|
-i projectName=PROJECTNAME (all formats), default: filename
|
||||||
-i limit=LIMIT (all formats), default: -1
|
-i limit=LIMIT (all formats), default: -1
|
||||||
-i includeFileSources=INCLUDEFILESOURCES (all formats), default: false
|
-i includeFileSources=true/false (all formats), default: false
|
||||||
-i trimStrings=TRIMSTRINGS (xml, json), default: false
|
-i trimStrings=true/false (xml, json), default: false
|
||||||
-i storeEmptyStrings=STOREEMPTYSTRINGS (xml, json), default: true
|
-i storeEmptyStrings=true/false (xml, json), default: true
|
||||||
-i guessCellValueTypes=GUESSCELLVALUETYPES (xml, csv, tsv, fixed-width, json), default: false
|
-i guessCellValueTypes=true/false (xml, csv, tsv, fixed-width, json), default: false
|
||||||
-i encoding=ENCODING (csv, tsv, line-based, fixed-width), please provide short encoding name (e.g. UTF-8)
|
-i encoding=ENCODING (csv, tsv, line-based, fixed-width), please provide short encoding name (e.g. UTF-8)
|
||||||
-i ignoreLines=IGNORELINES (csv, tsv, line-based, fixed-width, xlsx, ods), default: -1
|
-i ignoreLines=IGNORELINES (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: -1
|
||||||
-i headerLines=HEADERLINES (csv, tsv, fixed-width, xlsx, ods), default: 1
|
-i headerLines=HEADERLINES (csv, tsv, fixed-width, xls, xlsx, ods), default: 1, default fixed-width: 0
|
||||||
-i skipDataLines=SKIPDATALINES (csv, tsv, line-based, fixed-width, xlsx, ods), default: 0
|
-i skipDataLines=true/false (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: 0, default line-based: -1
|
||||||
-i storeBlankRows=STOREBLANKROWS (csv, tsv, line-based, fixed-width, xlsx, ods), default: true
|
-i storeBlankRows=true/false (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: true
|
||||||
-i processQuotes=PROCESSQUOTES (csv, tsv), default: true
|
-i processQuotes=true/false (csv, tsv), default: true
|
||||||
-i storeBlankCellsAsNulls=STOREBLANKCELLSASNULLS (csv, tsv, line-based, fixed-width, xlsx, ods), default: true
|
-i storeBlankCellsAsNulls=true/false (csv, tsv, line-based, fixed-width, xls, xlsx, ods), default: true
|
||||||
-i linesPerRow=LINESPERROW (line-based), default: 1
|
-i linesPerRow=LINESPERROW (line-based), default: 1
|
||||||
|
|
||||||
== example ==
|
== templating options (alternative exportformat) ==
|
||||||
|
-t template=TEMPLATE (mandatory; (big) text string that you enter in the *row template* textfield in the export/templating menu in the browser app)
|
||||||
|
-t mode=row-based/record-based (engine mode, default: row-based)
|
||||||
|
-t prefix=PREFIX (text string that you enter in the *prefix* textfield in the browser app)
|
||||||
|
-t rowSeparator=ROWSEPARATOR (text string that you enter in the *row separator* textfield in the browser app)
|
||||||
|
-t suffix=SUFFIX (text string that you enter in the *suffix* textfield in the browser app)
|
||||||
|
-t filterQuery=REGEX (Simple RegEx text filter on filterColumn, e.g. ^12015$)
|
||||||
|
-t filterColumn=COLUMNNAME (column name for filterQuery, default: name of first column)
|
||||||
|
-t facets=FACETS (facets config in json format, may be extracted with browser dev tools in browser app)
|
||||||
|
-t splitToFiles=true/false (will split each row/record into a single file; it specifies a presumably unique character series for splitting; prefix and suffix will be applied to all files
|
||||||
|
-t suffixById=true/false (enhancement option for splitToFiles; will generate filename-suffix from values in key column)
|
||||||
|
|
||||||
|
== examples ==
|
||||||
|
|
||||||
download example data
|
download example data
|
||||||
|
|
||||||
|
@ -90,7 +103,7 @@ unzip master.zip openrefine-batch-master/examples/*
|
||||||
mv openrefine-batch-master/examples .
|
mv openrefine-batch-master/examples .
|
||||||
rm -f master.zip
|
rm -f master.zip
|
||||||
|
|
||||||
execute openrefine-batch.sh
|
example 1 (input, transform, export to tsv)
|
||||||
|
|
||||||
./openrefine-batch.sh \
|
./openrefine-batch.sh \
|
||||||
-a examples/powerhouse-museum/input/ \
|
-a examples/powerhouse-museum/input/ \
|
||||||
|
@ -101,6 +114,9 @@ execute openrefine-batch.sh
|
||||||
-i guessCellValueTypes=true \
|
-i guessCellValueTypes=true \
|
||||||
-RX
|
-RX
|
||||||
|
|
||||||
|
example 2 (input, transform, templating export)
|
||||||
|
|
||||||
|
./openrefine-batch.sh -a examples/powerhouse-museum/input/ -b examples/powerhouse-museum/config/ -c examples/powerhouse-museum/output/ -f tsv -i processQuotes=false -i guessCellValueTypes=true -RX -t template='{ "Record ID" : {{jsonize(cells["Record ID"].value)}}, "Object Title" : {{jsonize(cells["Object Title"].value)}}, "Registration Number" : {{jsonize(cells["Registration Number"].value)}}, "Description." : {{jsonize(cells["Description."].value)}}, "Marks" : {{jsonize(cells["Marks"].value)}}, "Production Date" : {{jsonize(cells["Production Date"].value)}}, "Provenance (Production)" : {{jsonize(cells["Provenance (Production)"].value)}}, "Provenance (History)" : {{jsonize(cells["Provenance (History)"].value)}}, "Categories" : {{jsonize(cells["Categories"].value)}}, "Persistent Link" : {{jsonize(cells["Persistent Link"].value)}}, "Height" : {{jsonize(cells["Height"].value)}}, "Width" : {{jsonize(cells["Width"].value)}}, "Depth" : {{jsonize(cells["Depth"].value)}}, "Diameter" : {{jsonize(cells["Diameter"].value)}}, "Weight" : {{jsonize(cells["Weight"].value)}}, "License info" : {{jsonize(cells["License info"].value)}} }' -t rowSeparator=',' -t prefix='{ "rows" : [ ' -t suffix='] }' -t splitToFiles=true
|
||||||
EOF
|
EOF
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
@ -124,7 +140,7 @@ if [ "$NUMARGS" -eq 0 ]; then
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# get user input
|
# get user input
|
||||||
options="a:b:c:d:e:f:i:m:p:ERXh"
|
options="a:b:c:d:e:f:i:m:p:t:ERXh"
|
||||||
while getopts $options opt; do
|
while getopts $options opt; do
|
||||||
case $opt in
|
case $opt in
|
||||||
a ) inputdir=$(readlink -f ${OPTARG}); if [ -n "${inputdir// }" ] ; then inputfiles=($(find -L "${inputdir}"/* -type f -printf "%f\n" 2>/dev/null)); fi ;;
|
a ) inputdir=$(readlink -f ${OPTARG}); if [ -n "${inputdir// }" ] ; then inputfiles=($(find -L "${inputdir}"/* -type f -printf "%f\n" 2>/dev/null)); fi ;;
|
||||||
|
@ -136,6 +152,7 @@ while getopts $options opt; do
|
||||||
i ) inputoptions+=("--${OPTARG}") ;;
|
i ) inputoptions+=("--${OPTARG}") ;;
|
||||||
m ) ram=${OPTARG} ;;
|
m ) ram=${OPTARG} ;;
|
||||||
p ) port=${OPTARG} ;;
|
p ) port=${OPTARG} ;;
|
||||||
|
t ) templating+=("--${OPTARG}") ; exportformat="txt" ;;
|
||||||
E ) export="false" ;;
|
E ) export="false" ;;
|
||||||
R ) restarttransform="false" ;;
|
R ) restarttransform="false" ;;
|
||||||
X ) restartfile="false" ;;
|
X ) restartfile="false" ;;
|
||||||
|
@ -191,6 +208,7 @@ echo "OpenRefine port: $port"
|
||||||
echo "OpenRefine workspace: $outputdir"
|
echo "OpenRefine workspace: $outputdir"
|
||||||
echo "Export to workspace: $export"
|
echo "Export to workspace: $export"
|
||||||
echo "Export format: $exportformat"
|
echo "Export format: $exportformat"
|
||||||
|
echo "Templating options: ${templating[*]}"
|
||||||
echo "restart after file: $restartfile"
|
echo "restart after file: $restartfile"
|
||||||
echo "restart after transform: $restarttransform"
|
echo "restart after transform: $restarttransform"
|
||||||
echo ""
|
echo ""
|
||||||
|
@ -241,7 +259,7 @@ if [ -n "$inputfiles" ]; then
|
||||||
for inputfile in "${inputfiles[@]}" ; do
|
for inputfile in "${inputfiles[@]}" ; do
|
||||||
echo "import ${inputfile}..."
|
echo "import ${inputfile}..."
|
||||||
# run client with input command
|
# run client with input command
|
||||||
openrefine-client/openrefine-client_0-3-1_linux-64bit -P ${port} -c ${inputdir}/${inputfile} $inputformat "${inputoptions[@]}"
|
openrefine-client/openrefine-client_0-3-4_linux-64bit -P ${port} -c ${inputdir}/${inputfile} $inputformat "${inputoptions[@]}"
|
||||||
# show allocated system resources
|
# show allocated system resources
|
||||||
ps -o start,etime,%mem,%cpu,rss -p ${pid} --sort=start
|
ps -o start,etime,%mem,%cpu,rss -p ${pid} --sort=start
|
||||||
memoryload+=($(ps --no-headers -o rss -p ${pid}))
|
memoryload+=($(ps --no-headers -o rss -p ${pid}))
|
||||||
|
@ -272,7 +290,7 @@ if [ -n "$jsonfiles" ] || [ "$export" = "true" ]; then
|
||||||
|
|
||||||
# get project ids
|
# get project ids
|
||||||
echo "get project ids..."
|
echo "get project ids..."
|
||||||
openrefine-client/openrefine-client_0-3-1_linux-64bit -P ${port} -l > "${outputdir}/projects.tmp"
|
openrefine-client/openrefine-client_0-3-4_linux-64bit -P ${port} -l > "${outputdir}/projects.tmp"
|
||||||
projectids=($(cut -c 2-14 "${outputdir}/projects.tmp"))
|
projectids=($(cut -c 2-14 "${outputdir}/projects.tmp"))
|
||||||
projectnames=($(cut -c 17- "${outputdir}/projects.tmp"))
|
projectnames=($(cut -c 17- "${outputdir}/projects.tmp"))
|
||||||
cat "${outputdir}/projects.tmp" && rm "${outputdir:?}/projects.tmp"
|
cat "${outputdir}/projects.tmp" && rm "${outputdir:?}/projects.tmp"
|
||||||
|
@ -309,7 +327,7 @@ if [ -n "$jsonfiles" ] || [ "$export" = "true" ]; then
|
||||||
for jsonfile in "${jsonfiles[@]}" ; do
|
for jsonfile in "${jsonfiles[@]}" ; do
|
||||||
echo "transform ${jsonfile}..."
|
echo "transform ${jsonfile}..."
|
||||||
# run client with apply command
|
# run client with apply command
|
||||||
openrefine-client/openrefine-client_0-3-1_linux-64bit -P ${port} -f ${configdir}/${jsonfile} ${projectids[i]}
|
openrefine-client/openrefine-client_0-3-4_linux-64bit -P ${port} -f ${configdir}/${jsonfile} ${projectids[i]}
|
||||||
# allocated system resources
|
# allocated system resources
|
||||||
ps -o start,etime,%mem,%cpu,rss -p ${pid} --sort=start
|
ps -o start,etime,%mem,%cpu,rss -p ${pid} --sort=start
|
||||||
memoryload+=($(ps --no-headers -o rss -p ${pid}))
|
memoryload+=($(ps --no-headers -o rss -p ${pid}))
|
||||||
|
@ -341,7 +359,7 @@ if [ -n "$jsonfiles" ] || [ "$export" = "true" ]; then
|
||||||
filename=${projectnames[i]%.*}
|
filename=${projectnames[i]%.*}
|
||||||
echo "export to file ${filename}.${exportformat}..."
|
echo "export to file ${filename}.${exportformat}..."
|
||||||
# run client with export command
|
# run client with export command
|
||||||
openrefine-client/openrefine-client_0-3-1_linux-64bit -P ${port} -E --output="${outputdir}/${filename}.${exportformat}" ${projectids[i]}
|
openrefine-client/openrefine-client_0-3-4_linux-64bit -P ${port} -E --output="${outputdir}/${filename}.${exportformat}" "${templating[@]}" ${projectids[i]}
|
||||||
# show allocated system resources
|
# show allocated system resources
|
||||||
ps -o start,etime,%mem,%cpu,rss -p ${pid} --sort=start
|
ps -o start,etime,%mem,%cpu,rss -p ${pid} --sort=start
|
||||||
memoryload+=($(ps --no-headers -o rss -p ${pid}))
|
memoryload+=($(ps --no-headers -o rss -p ${pid}))
|
||||||
|
|
Binary file not shown.
Loading…
Reference in New Issue