Merge pull request #113 from opencultureconsulting:56-search

56-search
This commit is contained in:
Felix Lohmeier 2023-10-28 13:54:01 +02:00 committed by GitHub
commit 603e86c60d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 321 additions and 2 deletions

View File

@ -11,6 +11,7 @@
- [info](info.md) - [info](info.md)
- [list](list.md) - [list](list.md)
- [run](run.md) - [run](run.md)
- [search](search.md)
- [test](test.md) - [test](test.md)
- [transform](transform.md) - [transform](transform.md)
@ -30,6 +31,7 @@ Commands:
import commands to create OpenRefine projects from files or URLs import commands to create OpenRefine projects from files or URLs
list list projects on OpenRefine server list list projects on OpenRefine server
info show OpenRefine project's metadata info show OpenRefine project's metadata
search apply regex to each column and print matches in flattened tsv format
test run functional tests on tmp OpenRefine workspace test run functional tests on tmp OpenRefine workspace
transform apply undo/redo JSON file(s) to an OpenRefine project transform apply undo/redo JSON file(s) to an OpenRefine project
export commands to export data from OpenRefine projects to files export commands to export data from OpenRefine projects to files
@ -52,6 +54,7 @@ Examples:
orcli list orcli list
orcli info "duplicates" orcli info "duplicates"
orcli transform "duplicates" "https://git.io/fj5ju" orcli transform "duplicates" "https://git.io/fj5ju"
orcli search "duplicates" "^Ben"
orcli export tsv "duplicates" orcli export tsv "duplicates"
orcli export tsv "duplicates" --output "duplicates.tsv" orcli export tsv "duplicates" --output "duplicates.tsv"
orcli delete "duplicates" orcli delete "duplicates"

35
help/search.md Normal file
View File

@ -0,0 +1,35 @@
# orcli search
```
orcli search
apply regex to each column and print matches in flattened tsv format
output: index column value
Usage:
orcli search PROJECT [REGEX] [OPTIONS]
orcli search --help | -h
Options:
--index COLUMN
print column values instead of row.index in the first column of the output
--help, -h
Show this help
Arguments:
PROJECT
project name or id
REGEX
search term (regular expression, case-sensitive)
Examples:
orcli search "duplicates" "^Ben"
orcli search 1234567890123 "^Ben"
orcli search "duplicates" "^F$" --index "email"
orcli search "duplicates" | column -t -s $'\t'
```
code: [src/search_command.sh](../src/search_command.sh)

189
orcli
View File

@ -39,6 +39,7 @@ orcli_usage() {
printf " %s commands to create OpenRefine projects from files or URLs\n" "import " printf " %s commands to create OpenRefine projects from files or URLs\n" "import "
printf " %s list projects on OpenRefine server\n" "list " printf " %s list projects on OpenRefine server\n" "list "
printf " %s show OpenRefine project's metadata\n" "info " printf " %s show OpenRefine project's metadata\n" "info "
printf " %s apply regex to each column and print matches in flattened tsv format\n" "search "
printf " %s run functional tests on tmp OpenRefine workspace\n" "test " printf " %s run functional tests on tmp OpenRefine workspace\n" "test "
printf " %s apply undo/redo JSON file(s) to an OpenRefine project\n" "transform " printf " %s apply undo/redo JSON file(s) to an OpenRefine project\n" "transform "
printf " %s commands to export data from OpenRefine projects to files\n" "export " printf " %s commands to export data from OpenRefine projects to files\n" "export "
@ -72,6 +73,7 @@ orcli_usage() {
printf " orcli list\n" printf " orcli list\n"
printf " orcli info \"duplicates\"\n" printf " orcli info \"duplicates\"\n"
printf " orcli transform \"duplicates\" \"https://git.io/fj5ju\"\n" printf " orcli transform \"duplicates\" \"https://git.io/fj5ju\"\n"
printf " orcli search \"duplicates\" \"^Ben\"\n"
printf " orcli export tsv \"duplicates\"\n" printf " orcli export tsv \"duplicates\"\n"
printf " orcli export tsv \"duplicates\" --output \"duplicates.tsv\"\n" printf " orcli export tsv \"duplicates\" --output \"duplicates.tsv\"\n"
printf " orcli delete \"duplicates\"\n" printf " orcli delete \"duplicates\"\n"
@ -563,6 +565,65 @@ orcli_info_usage() {
fi fi
} }
# :command.usage
orcli_search_usage() {
  # Print the help text for `orcli search` to stdout.
  #
  # Globals:
  #   long_usage (read) - when non-empty the long form is printed (description,
  #                       options, arguments, examples); otherwise only the
  #                       one-line summary and the usage synopsis.
  if [[ -n $long_usage ]]; then
    printf "orcli search\n"
    echo

    printf " apply regex to each column and print matches in flattened tsv format\n output: index column value\n"
    echo

  else
    printf "orcli search - apply regex to each column and print matches in flattened tsv format\n"
    echo

  fi

  printf "%s\n" "Usage:"
  printf " orcli search PROJECT [REGEX] [OPTIONS]\n"
  printf " orcli search --help | -h\n"
  echo

  # :command.long_usage
  if [[ -n $long_usage ]]; then
    printf "%s\n" "Options:"

    # :command.usage_flags
    # :flag.usage
    printf " %s\n" "--index COLUMN"
    printf " print column values instead of row.index in the first column of the output\n"
    echo

    # :command.usage_fixed_flags
    printf " %s\n" "--help, -h"
    printf " Show this help\n"
    echo

    # :command.usage_args
    printf "%s\n" "Arguments:"

    # :argument.usage
    printf " %s\n" "PROJECT"
    printf " project name or id\n"
    echo

    # :argument.usage
    printf " %s\n" "REGEX"
    printf " search term (regular expression, case-sensitive)\n"
    echo

    # :command.usage_examples
    printf "%s\n" "Examples:"
    printf " orcli search \"duplicates\" \"^Ben\"\n"
    printf " orcli search 1234567890123 \"^Ben\"\n"
    printf " orcli search \"duplicates\" \"^F$\" --index \"email\"\n"
    # Print this example through %s: inside a printf format string the "\t"
    # would be expanded to a raw TAB, so the help would not show $'\t'.
    printf "%s\n" " orcli search \"duplicates\" | column -t -s \$'\t'"
    echo

  fi
}
# :command.usage # :command.usage
orcli_test_usage() { orcli_test_usage() {
if [[ -n $long_usage ]]; then if [[ -n $long_usage ]]; then
@ -1242,6 +1303,10 @@ send_completions() {
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help -h csv tsv")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help -h csv tsv")" -- "$cur" )'
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' \'search\'*)'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help --index -h")" -- "$cur" )'
echo $' ;;'
echo $''
echo $' \'export\'*)' echo $' \'export\'*)'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help -h template tsv")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help -h template tsv")" -- "$cur" )'
echo $' ;;' echo $' ;;'
@ -1263,7 +1328,7 @@ send_completions() {
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' *)' echo $' *)'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help --version -h -v completions delete export import info list run test transform")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help --version -h -v completions delete export import info list run search test transform")" -- "$cur" )'
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' esac' echo $' esac'
@ -1531,6 +1596,46 @@ orcli_info_command() {
} }
# :command.function
orcli_search_command() {
  # src/search_command.sh
  # shellcheck shell=bash disable=SC2154

  # Apply a regex to every column of a project and export the matches through
  # a template: one output line per matching cell (index, column name, value).
  #
  # Globals:
  #   args (read/written) - reads args[project], args[regex], args[--index];
  #                         writes args[--facets]
  #   data (appended)     - receives the project, format and template fields
  #   projectid, template, regex_json (written)

  # get project id
  projectid="$(get_id "${args[project]}")"

  # The facets config is a JSON document, so the regex must be escaped for a
  # JSON string literal. Otherwise a backslash or a double quote in the regex
  # (e.g. "\bBen") yields invalid JSON or a silently altered pattern.
  regex_json=${args[regex]//\\/\\\\}
  regex_json=${regex_json//\"/\\\"}

  # set facets config: keep only rows where at least one column matches
  args['--facets']='[ { "type": "list", "expression": "grel:filter(row.columnNames,cn,cells[cn].value.find(/'
  args['--facets']+="${regex_json}"
  args['--facets']+='/).length()>0).length()>0", "columnName": "", "selection": [ { "v": { "v": true } } ] } ]'

  # set template: iterate over the matching columns of each row
  # (the template is not JSON, so the regex is inserted as given)
  template='{{'
  template+='forEach(filter(row.columnNames, cn, cells[cn].value.find(/'
  template+="${args[regex]}"
  template+='/).length()>0), cn,'
  # first output column: value of the --index column if given, else the
  # 1-based row number
  if [[ ${args[--index]} ]]; then
    template+='cells["'
    template+="${args[--index]}"
    template+='"].value'
  else
    template+='(row.index + 1)'
  fi
  template+='+ "\t" + cn + "\t" +'
  # values containing the character tested by v.contains(...) are wrapped in
  # double quotes, with embedded double quotes doubled
  template+='forNonBlank(cells[cn].value, v, if(v.contains(" "), if(v.contains('\''"'\''), '\''"'\'' + v.replace('\''"'\'','\''""'\'') + '\''"'\'', '\''"'\'' + v + '\''"'\''), v),"")'
  template+='+ "\n")'
  template+='}}'

  # assemble specific post data
  data+=("project=${projectid}")
  data+=("format=template")
  data+=("template=${template}")

  # call post_export function to post data and validate results
  post_export "${data[@]}"
}
# :command.function # :command.function
orcli_test_command() { orcli_test_command() {
# src/test_command.sh # src/test_command.sh
@ -1999,6 +2104,13 @@ parse_requirements() {
shift $# shift $#
;; ;;
search)
action="search"
shift
orcli_search_parse_requirements "$@"
shift $#
;;
test) test)
action="test" action="test"
shift shift
@ -2900,6 +3012,80 @@ orcli_info_parse_requirements() {
} }
# :command.parse_requirements
orcli_search_parse_requirements() {
  # Parse the command line of `orcli search` into the global `args` array.
  #
  # Arguments: the words following `search` on the command line
  # Globals:   args (written: project, regex, --index), action, key,
  #            long_usage (written)
  # Exits:     after printing help for --help/-h; with status 1 on an unknown
  #            option, a surplus argument, a missing --index value or a
  #            missing PROJECT

  # :command.fixed_flags_filter
  # only a leading --help / -h is handled here
  if [[ $# -gt 0 ]]; then
    case "${1:-}" in
      --help | -h)
        long_usage=yes
        orcli_search_usage
        exit
        ;;
    esac
  fi

  # :command.command_filter
  action="search"

  # :command.parse_requirements_while
  while (($# > 0)); do
    key="$1"
    if [[ $key == "--index" ]]; then
      # :flag.case_arg
      if [[ -z ${2+x} ]]; then
        printf "%s\n" "--index requires an argument: --index COLUMN" >&2
        exit 1
      fi
      args['--index']="$2"
      shift 2
    elif [[ $key == -?* ]]; then
      printf "invalid option: %s\n" "$key" >&2
      exit 1
    elif [[ -z ${args['project']+x} ]]; then
      # :command.parse_requirements_case_simple
      # first positional word is the project
      args['project']=$1
      shift
    elif [[ -z ${args['regex']+x} ]]; then
      # second positional word is the regex
      args['regex']=$1
      shift
    else
      printf "invalid argument: %s\n" "$key" >&2
      exit 1
    fi
  done

  # :command.required_args_filter
  if [[ -z ${args['project']+x} ]]; then
    printf "missing required argument: PROJECT\nusage: orcli search PROJECT [REGEX] [OPTIONS]\n" >&2
    exit 1
  fi
}
# :command.parse_requirements # :command.parse_requirements
orcli_test_parse_requirements() { orcli_test_parse_requirements() {
# :command.fixed_flags_filter # :command.fixed_flags_filter
@ -3514,6 +3700,7 @@ run() {
"import tsv") orcli_import_tsv_command ;; "import tsv") orcli_import_tsv_command ;;
"list") orcli_list_command ;; "list") orcli_list_command ;;
"info") orcli_info_command ;; "info") orcli_info_command ;;
"search") orcli_search_command ;;
"test") orcli_test_command ;; "test") orcli_test_command ;;
"transform") orcli_transform_command ;; "transform") orcli_transform_command ;;
"export") orcli_export_command ;; "export") orcli_export_command ;;

View File

@ -17,6 +17,7 @@ examples:
- orcli list - orcli list
- orcli info "duplicates" - orcli info "duplicates"
- orcli transform "duplicates" "https://git.io/fj5ju" - orcli transform "duplicates" "https://git.io/fj5ju"
- orcli search "duplicates" "^Ben"
- orcli export tsv "duplicates" - orcli export tsv "duplicates"
- orcli export tsv "duplicates" --output "duplicates.tsv" - orcli export tsv "duplicates" --output "duplicates.tsv"
- orcli delete "duplicates" - orcli delete "duplicates"
@ -206,6 +207,25 @@ commands:
- orcli info 1234567890123 - orcli info 1234567890123
- orcli info "duplicates" | jq -r .columns[] - orcli info "duplicates" | jq -r .columns[]
# `search` subcommand definition: positional arguments `project` (alias to an
# anchor defined elsewhere in this file) and `regex` (anchored as &regex),
# one flag `--index` that takes a `column` value, and usage examples.
- name: search
help: |-
apply regex to each column and print matches in flattened tsv format
output: index column value
args:
- *project
- &regex
name: regex
help: search term (regular expression, case-sensitive)
flags:
- long: --index
help: print column values instead of row.index in the first column of the output
arg: column
examples:
- orcli search "duplicates" "^Ben"
- orcli search 1234567890123 "^Ben"
- orcli search "duplicates" "^F$" --index "email"
- orcli search "duplicates" | column -t -s \$'\t'
- name: test - name: test
help: run functional tests on tmp OpenRefine workspace help: run functional tests on tmp OpenRefine workspace

View File

@ -66,6 +66,10 @@ send_completions() {
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help -h csv tsv")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help -h csv tsv")" -- "$cur" )'
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' \'search\'*)'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help --index -h")" -- "$cur" )'
echo $' ;;'
echo $''
echo $' \'export\'*)' echo $' \'export\'*)'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help -h template tsv")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help -h template tsv")" -- "$cur" )'
echo $' ;;' echo $' ;;'
@ -87,7 +91,7 @@ send_completions() {
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' *)' echo $' *)'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help --version -h -v completions delete export import info list run test transform")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help --version -h -v completions delete export import info list run search test transform")" -- "$cur" )'
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' esac' echo $' esac'

34
src/search_command.sh Normal file
View File

@ -0,0 +1,34 @@
# shellcheck shell=bash disable=SC2154

# Apply a regex to every column of a project and export the matches through
# a template: one output line per matching cell (index, column name, value).
# Reads args[project], args[regex] and args[--index]; writes args[--facets]
# and appends the request fields to the data array.

# get project id
projectid="$(get_id "${args[project]}")"

# The facets config is a JSON document, so the regex must be escaped for a
# JSON string literal. Otherwise a backslash or a double quote in the regex
# (e.g. "\bBen") yields invalid JSON or a silently altered pattern.
regex_json=${args[regex]//\\/\\\\}
regex_json=${regex_json//\"/\\\"}

# set facets config: keep only rows where at least one column matches
args['--facets']='[ { "type": "list", "expression": "grel:filter(row.columnNames,cn,cells[cn].value.find(/'
args['--facets']+="${regex_json}"
args['--facets']+='/).length()>0).length()>0", "columnName": "", "selection": [ { "v": { "v": true } } ] } ]'

# set template: iterate over the matching columns of each row
# (the template is not JSON, so the regex is inserted as given)
template='{{'
template+='forEach(filter(row.columnNames, cn, cells[cn].value.find(/'
template+="${args[regex]}"
template+='/).length()>0), cn,'
# first output column: value of the --index column if given, else the
# 1-based row number
if [[ ${args[--index]} ]]; then
  template+='cells["'
  template+="${args[--index]}"
  template+='"].value'
else
  template+='(row.index + 1)'
fi
template+='+ "\t" + cn + "\t" +'
# values containing the character tested by v.contains(...) are wrapped in
# double quotes, with embedded double quotes doubled
template+='forNonBlank(cells[cn].value, v, if(v.contains(" "), if(v.contains('\''"'\''), '\''"'\'' + v.replace('\''"'\'','\''""'\'') + '\''"'\'', '\''"'\'' + v + '\''"'\''), v),"")'
template+='+ "\n")'
template+='}}'

# assemble specific post data
data+=("project=${projectid}")
data+=("format=template")
data+=("template=${template}")

# call post_export function to post data and validate results
post_export "${data[@]}"

36
tests/search.sh Normal file
View File

@ -0,0 +1,36 @@
#!/bin/bash

# Functional test for `orcli search` used together with --index.
t="search"

# scratch directory, removed on exit and on INT, QUIT or TERM
tmpdir="$(mktemp -d)"
trap 'rm -rf "${tmpdir}"' EXIT INT QUIT TERM

# fixture: CSV file to import
cat > "${tmpdir}/${t}.csv" << 'DATA'
email,name,state,gender,purchase
danny.baron@example1.com,Danny Baron,CA,M,TV
melanie.white@example2.edu,Melanie White,NC,F,iPhone
danny.baron@example1.com,D. Baron,CA,M,Winter jacket
ben.tyler@example3.org,Ben Tyler,NV,M,Flashlight
arthur.duff@example4.com,Arthur Duff,OR,M,Dining table
danny.baron@example1.com,Daniel Baron,CA,M,Bike
jean.griffith@example5.org,Jean Griffith,WA,F,Power drill
melanie.white@example2.edu,Melanie White,NC,F,iPad
ben.morisson@example6.org,Ben Morisson,FL,M,Amplifier
arthur.duff@example4.com,Arthur Duff,OR,M,Night table
DATA

# expected output of the search
cat > "${tmpdir}/${t}.assert" << 'DATA'
melanie.white@example2.edu name Melanie White
melanie.white@example2.edu name "Melanie White"
DATA

# import the fixture and run the search from inside the scratch directory
cd "${tmpdir}" || exit 1
orcli import csv "${t}.csv" --projectName "${t}"
orcli search "${t}" "^Mel" --index "email" > "${t}.output"

# compare actual against expected; the diff status is the script's status
diff -u "${t}.assert" "${t}.output"