2019-08-20 04:30:50 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
2019-08-22 01:43:52 +02:00
"# Test module cli in a Python 2 environment"
2019-08-20 04:30:50 +02:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Install\n",
"\n",
"This notebook requires a Python 2.7 environment and an OpenRefine server running at http://127.0.0.1:3333."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7. More details about Python 2 support in pip, can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support\u001b[0m\n",
"Processing /home/felix/git/openrefine-client\n",
"Requirement already satisfied, skipping upgrade: urllib2_file in /home/felix/.local/lib/python2.7/site-packages (from openrefine-client==0.3.7) (0.2.1)\n",
"Installing collected packages: openrefine-client\n",
" Found existing installation: openrefine-client 0.3.7\n",
" Uninstalling openrefine-client-0.3.7:\n",
" Successfully uninstalled openrefine-client-0.3.7\n",
" Running setup.py install for openrefine-client ... \u001b[?25ldone\n",
"\u001b[?25hSuccessfully installed openrefine-client-0.3.7\n"
]
}
],
"source": [
"import sys\n",
"!{sys.executable} -m pip install .. --user --upgrade"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
2019-08-22 01:43:52 +02:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/tmp/tmp24HyYg\n"
]
}
],
"source": [
"import tempfile\n",
"import shutil\n",
"import os\n",
"dirpath = tempfile.mkdtemp()\n",
"shutil.copytree('data',dirpath + '/data')\n",
"print(dirpath)\n",
"os.chdir(dirpath)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
2019-08-20 04:30:50 +02:00
"outputs": [],
"source": [
2019-08-22 01:43:52 +02:00
"from google.refine import cli"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## README.md"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Download to file duplicates.csv complete\n"
]
}
],
"source": [
"cli.download('https://git.io/fj5hF','duplicates.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 2019539621291\n",
"rows: 10\n"
]
}
],
"source": [
"p1 = cli.create('duplicates.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### List"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2019539621291: duplicates\n"
]
}
],
"source": [
"cli.ls()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Info"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" id: 2019539621291\n",
" url: http://127.0.0.1:3333/project?project=2019539621291\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:03Z\n",
" created: 2019-08-21T23:31:02Z\n",
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n"
]
}
],
"source": [
"cli.info(p1.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Export"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"email\tname\tstate\tgender\tpurchase\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\tiPhone\n",
"danny.baron@example1.com\tD. Baron\tCA\tM\tWinter jacket\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\n",
"danny.baron@example1.com\tDaniel Baron\tCA\tM\tBike\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\tiPad\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\n"
]
}
],
"source": [
"cli.export(p1.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Apply"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Download to file duplicates-deletion.json complete\n"
]
}
],
"source": [
"cli.download('https://git.io/fj5ju','duplicates-deletion.json')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File duplicates-deletion.json has been successfully applied to project 2019539621291\n"
]
}
],
"source": [
"cli.apply(p1.project_id, 'duplicates-deletion.json')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"email\tcount\tname\tstate\tgender\tpurchase\n",
"arthur.duff@example4.com\t2\tArthur Duff\tOR\tM\tDining table\n",
"ben.morisson@example6.org\t1\tBen Morisson\tFL\tM\tAmplifier\n",
"ben.tyler@example3.org\t1\tBen Tyler\tNV\tM\tFlashlight\n",
"danny.baron@example1.com\t3\tDanny Baron\tCA\tM\tTV\n",
"jean.griffith@example5.org\t1\tJean Griffith\tWA\tF\tPower drill\n",
"melanie.white@example2.edu\t2\tMelanie White\tNC\tF\tiPhone\n"
]
}
],
"source": [
"cli.export(p1.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Export XLS"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"email\tcount\tname\tstate\tgender\tpurchase\n",
"arthur.duff@example4.com\t2\tArthur Duff\tOR\tM\tDining table\n",
"ben.morisson@example6.org\t1\tBen Morisson\tFL\tM\tAmplifier\n",
"ben.tyler@example3.org\t1\tBen Tyler\tNV\tM\tFlashlight\n",
"danny.baron@example1.com\t3\tDanny Baron\tCA\tM\tTV\n",
"jean.griffith@example5.org\t1\tJean Griffith\tWA\tF\tPower drill\n",
"melanie.white@example2.edu\t2\tMelanie White\tNC\tF\tiPhone\n"
]
}
],
"source": [
"cli.export(p1.project_id, 'deduped.xls')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Project 2019539621291 has been successfully deleted\n"
]
}
],
"source": [
"cli.delete(p1.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Templating"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1716843473792\n",
"rows: 10\n"
]
}
],
"source": [
"p2 = cli.create('duplicates.csv')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{ \"events\" : [\n",
" { \"name\" : \"Melanie White\", \"purchase\" : \"iPhone\" },\n",
" { \"name\" : \"Jean Griffith\", \"purchase\" : \"Power drill\" },\n",
" { \"name\" : \"Melanie White\", \"purchase\" : \"iPad\" }\n",
"] }"
]
}
],
"source": [
"cli.templating(p2.project_id,\n",
"prefix='''{ \"events\" : [\n",
"''',\n",
"template=' { \"name\" : {{jsonize(cells[\"name\"].value)}}, \"purchase\" : {{jsonize(cells[\"purchase\"].value)}} }',\n",
"rowSeparator=''',\n",
"''',\n",
"suffix='''\n",
"] }''',\n",
"filterQuery='^F$',\n",
"filterColumn='gender')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Export to files complete. Last file: advanced_3.json\n"
]
}
],
"source": [
"cli.templating(p2.project_id,\n",
"prefix='''{ \"events\" : [\n",
"''',\n",
"template=' { \"name\" : {{jsonize(cells[\"name\"].value)}}, \"purchase\" : {{jsonize(cells[\"purchase\"].value)}} }',\n",
"rowSeparator=''',\n",
"''',\n",
"suffix='''\n",
"] }''',\n",
"filterQuery='^F$',\n",
"filterColumn='gender',\n",
"output_file='advanced.json',\n",
"splitToFiles=True)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Export to files complete. Last file: advanced_melanie.white@example2.edu.json\n"
]
}
],
"source": [
"cli.templating(p2.project_id,\n",
"prefix='''{ \"events\" : [\n",
"''',\n",
"template=' { \"name\" : {{jsonize(cells[\"name\"].value)}}, \"purchase\" : {{jsonize(cells[\"purchase\"].value)}} }',\n",
"rowSeparator=''',\n",
"''',\n",
"suffix='''\n",
"] }''',\n",
"filterQuery='^F$',\n",
"filterColumn='gender',\n",
"output_file='advanced.json',\n",
"splitToFiles=True,\n",
"suffixById=True)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['advanced_jean.griffith@example5.org.json',\n",
" 'advanced_melanie.white@example2.edu.json',\n",
" 'advanced_3.json',\n",
" 'advanced_2.json',\n",
" 'advanced_1.json',\n",
" 'duplicates-deletion.json',\n",
" 'duplicates.csv',\n",
" 'data']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.listdir(os.getcwd())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Project 1716843473792 has been successfully deleted\n"
]
}
],
"source": [
"cli.delete(p2.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Unicode"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### fruits"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1929957235590\n",
"rows: 5\n",
" id: 1929957235590\n",
" url: http://127.0.0.1:3333/project?project=1929957235590\n",
" name: evil-fruits\n",
" modified: 2019-08-21T23:35:47Z\n",
" created: 2019-08-21T23:35:47Z\n",
" rowCount: 5\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/evil-fruits.tsv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'evil-fruits', u'processQuotes': True, u'limit': -1, u'trimStrings': False, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: 🔣\n",
" column 002: code\n",
" column 003: meaning\n",
"🔣\tcode\tmeaning\n",
"🍇\t1F347\tGRAPES\n",
"🍉\t1F349\tWATERMELON\n",
"🍒\t1F352\tCHERRIES\n",
"🍓\t1F353\tSTRAWBERRY\n",
"🍍\t1F34D\tPINEAPPLE\n"
]
}
],
"source": [
"p1 = cli.create('data/cli/evil-fruits.tsv')\n",
"cli.info(p1.project_id)\n",
"cli.export(p1.project_id)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Export to file emojis.csv complete\n",
"🔣,code,meaning\n",
"🍇,1F347,GRAPES\n",
"🍉,1F349,WATERMELON\n",
"🍒,1F352,CHERRIES\n",
"🍓,1F353,STRAWBERRY\n",
"🍍,1F34D,PINEAPPLE\n",
"\n"
]
}
],
"source": [
"cli.export(p1.project_id, output_file='emojis.csv')\n",
"with open('emojis.csv', 'r') as f:\n",
" print(f.read())"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{ \"emojis\" : [\n",
" { \"symbol\" : \"🍇\", \"meaning\" : \"GRAPES\" },\n",
" { \"symbol\" : \"🍉\", \"meaning\" : \"WATERMELON\" },\n",
" { \"symbol\" : \"🍍\", \"meaning\" : \"PINEAPPLE\" }\n",
"] }"
]
}
],
"source": [
"cli.templating(p1.project_id,\n",
"prefix='''{ \"emojis\" : [\n",
"''',\n",
"template=' { \"symbol\" : {{jsonize(with(row.columnNames[0],cn,cells[cn].value))}}, \"meaning\" : {{jsonize(cells[\"meaning\"].value)}} }',\n",
"rowSeparator=''',\n",
"''',\n",
"suffix='''\n",
"] }''',\n",
"filterQuery='^1F34',\n",
"filterColumn='code')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Export to files complete. Last file: trái cây_3.json\n"
]
}
],
"source": [
"cli.templating(p1.project_id,\n",
"prefix='''{ \"emojis\" : [\n",
"''',\n",
"template=' { \"symbol\" : {{jsonize(with(row.columnNames[0],cn,cells[cn].value))}}, \"meaning\" : {{jsonize(cells[\"meaning\"].value)}} }',\n",
"rowSeparator=''',\n",
"''',\n",
"suffix='''\n",
"] }''',\n",
"filterQuery='^1F34',\n",
"filterColumn='code',\n",
"output_file='trái cây.json',\n",
"splitToFiles=True)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Export to files complete. Last file: trái cây_🍍.json\n"
]
}
],
"source": [
"cli.templating(p1.project_id,\n",
"prefix='''{ \"emojis\" : [\n",
"''',\n",
"template=' { \"symbol\" : {{jsonize(with(row.columnNames[0],cn,cells[cn].value))}}, \"meaning\" : {{jsonize(cells[\"meaning\"].value)}} }',\n",
"rowSeparator=''',\n",
"''',\n",
"suffix='''\n",
"] }''',\n",
"filterQuery='^1F34',\n",
"filterColumn='code',\n",
"output_file='trái cây.json',\n",
"splitToFiles=True,\n",
"suffixById=True)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['tr\\xc3\\xa1i c\\xc3\\xa2y_\\xf0\\x9f\\x8d\\x8d.json',\n",
" 'tr\\xc3\\xa1i c\\xc3\\xa2y_\\xf0\\x9f\\x8d\\x89.json',\n",
" 'tr\\xc3\\xa1i c\\xc3\\xa2y_\\xf0\\x9f\\x8d\\x87.json',\n",
" 'tr\\xc3\\xa1i c\\xc3\\xa2y_3.json',\n",
" 'tr\\xc3\\xa1i c\\xc3\\xa2y_2.json',\n",
" 'tr\\xc3\\xa1i c\\xc3\\xa2y_1.json',\n",
" 'emojis.csv',\n",
" 'advanced_jean.griffith@example5.org.json',\n",
" 'advanced_melanie.white@example2.edu.json',\n",
" 'advanced_3.json',\n",
" 'advanced_2.json',\n",
" 'advanced_1.json',\n",
" 'duplicates-deletion.json',\n",
" 'duplicates.csv',\n",
" 'data']"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.listdir(os.getcwd())"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Project 2401578251107 has been successfully deleted\n"
]
}
],
"source": [
"cli.delete(p1.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### emoji data"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 2314250240290\n",
"rows: 20\n",
" id: 2314250240290\n",
" url: http://127.0.0.1:3333/project?project=2314250240290\n",
" name: dữ liệu biểu tượng cảm xúc\n",
" modified: 2019-08-21T23:36:05Z\n",
" created: 2019-08-21T23:36:05Z\n",
" rowCount: 20\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/d\\u1eef li\\u1ec7u bi\\u1ec3u t\\u01b0\\u1ee3ng c\\u1ea3m x\\xfac.txt', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'd\\u1eef li\\u1ec7u bi\\u1ec3u t\\u01b0\\u1ee3ng c\\u1ea3m x\\xfac', u'processQuotes': True, u'skipDataLines': 34, u'limit': 20, u'trimStrings': False, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 0}]\n",
" column 001: Column 1\n",
" column 002: Column 2\n",
" column 003: Column 3\n",
" column 004: Column 4\n",
" column 005: Column 5\n",
" column 006: Column 6\n",
"Column 1\tColumn 2\tColumn 3\tColumn 4\tColumn 5\tColumn 6\n",
"00A9 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (©) COPYRIGHT SIGN\n",
"00AE ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (®) REGISTERED SIGN\n",
"203C ;\ttext ;\tL1 ;\tnone ;\ta j\t# V1.1 (‼) DOUBLE EXCLAMATION MARK\n",
"2049 ;\ttext ;\tL1 ;\tnone ;\ta j\t# V3.0 (⁉) EXCLAMATION QUESTION MARK\n",
"2122 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (™) TRADE MARK SIGN\n",
"2139 ;\ttext ;\tL1 ;\tnone ;\tj\t# V3.0 (ℹ ) INFORMATION SOURCE\n",
"2194 ;\ttext ;\tL1 ;\tnone ;\tz j\t# V1.1 (↔) LEFT RIGHT ARROW\n",
"2195 ;\ttext ;\tL1 ;\tnone ;\tz j\t# V1.1 (↕) UP DOWN ARROW\n",
"2196 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↖) NORTH WEST ARROW\n",
"2197 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↗) NORTH EAST ARROW\n",
"2198 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↘) SOUTH EAST ARROW\n",
"2199 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↙) SOUTH WEST ARROW\n",
"21A9 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↩) LEFTWARDS ARROW WITH HOOK\n",
"21AA ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↪) RIGHTWARDS ARROW WITH HOOK\n",
"231A ;\temoji ;\tL1 ;\tnone ;\tj\t# V1.1 (⌚) WATCH\n",
"231B ;\temoji ;\tL1 ;\tnone ;\tj\t# V1.1 (⌛) HOURGLASS\n",
"2328 ;\ttext ;\tL2 ;\tnone ;\tx\t# V1.1 (⌨) KEYBOARD\n",
"23CF ;\ttext ;\tL2 ;\tnone ;\tx\t# V4.0 (⏏) EJECT SYMBOL\n",
"23E9 ;\temoji ;\tL1 ;\tnone ;\tj w\t# V6.0 (⏩) BLACK RIGHT-POINTING DOUBLE TRIANGLE\n",
"23EA ;\temoji ;\tL1 ;\tnone ;\tj w\t# V6.0 (⏪) BLACK LEFT-POINTING DOUBLE TRIANGLE\n"
]
}
],
"source": [
"p1 = cli.create('data/cli/dữ liệu biểu tượng cảm xúc.txt',\n",
" project_format='tsv',\n",
" headerLines=0,\n",
" skipDataLines=34,\n",
" limit=20)\n",
"cli.info(p1.project_id)\n",
"cli.export(p1.project_id)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2314250240290: dữ liệu biểu tượng cảm xúc\n",
" 1929957235590: evil-fruits\n"
]
}
],
"source": [
"cli.ls()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Project 1602939526221 has been successfully deleted\n"
]
}
],
"source": [
"cli.delete(p1.project_id)"
2019-08-20 04:30:50 +02:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CSV"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 30,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 1675776970201\n",
2019-08-20 04:30:50 +02:00
"rows: 10\n",
2019-08-22 01:43:52 +02:00
" id: 1675776970201\n",
" url: http://127.0.0.1:3333/project?project=1675776970201\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:05Z\n",
" created: 2019-08-21T23:31:05Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>\t1\t2001-07-04T12:08:56\n",
"danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
2019-08-22 01:43:52 +02:00
"Project 1675776970201 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### encoding\n",
"\n",
"check TV symbol in line 1"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 31,
2019-08-20 04:30:50 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 2268199900543\n",
2019-08-20 04:30:50 +02:00
"rows: 10\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ðº)\t1\tWed, 4 Jul 2001\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>\t1\t2001-07-04T12:08:56\n",
"danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
2019-08-22 01:43:52 +02:00
"Project 2268199900543 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', encoding='ISO-8859-1')\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 32,
2019-08-20 04:30:50 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 1798292162864\n",
2019-08-20 04:30:50 +02:00
"rows: 10\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>\t1\t2001-07-04T12:08:56\n",
"danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
2019-08-22 01:43:52 +02:00
"Project 1798292162864 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', encoding='UTF-8')\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### guessCellValueTypes\n",
"\n",
"check OpenRefine GUI at url below: numbers should be green"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 33,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 2351526371150\n",
2019-08-20 04:30:50 +02:00
"rows: 10\n",
2019-08-22 01:43:52 +02:00
" id: 2351526371150\n",
" url: http://127.0.0.1:3333/project?project=2351526371150\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:05Z\n",
" created: 2019-08-21T23:31:05Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': True, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', guessCellValueTypes=True)\n",
"cli.info(p.project_id)"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 34,
2019-08-20 04:30:50 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"Project 2351526371150 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### headerLines\n",
"\n",
"check column names, should be Column 1..."
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 35,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 1753036694840\n",
2019-08-20 04:30:50 +02:00
"rows: 11\n",
"Column 1\tColumn 2\tColumn 3\tColumn 4\tColumn 5\tColumn 6\tColumn 7\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>\t1\t2001-07-04T12:08:56\n",
"danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
2019-08-22 01:43:52 +02:00
"Project 1753036694840 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', headerLines=0)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ignoreLines\n",
"\n",
"check column names, should start with arthur.duff as header"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 36,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 1567779238383\n",
2019-08-20 04:30:50 +02:00
"rows: 5\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
2019-08-22 01:43:52 +02:00
"Project 1567779238383 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', ignoreLines=5)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### limit\n",
"\n",
"should contain 5 rows"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 37,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 2236287775552\n",
2019-08-20 04:30:50 +02:00
"rows: 5\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>\t1\t2001-07-04T12:08:56\n",
"danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
2019-08-22 01:43:52 +02:00
"Project 2236287775552 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', limit=5)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### separator and processQuotes\n",
"\n",
"should contain 10 rows and 2 columns (Column 2)"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 38,
2019-08-20 04:30:50 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 2493837924937\n",
2019-08-20 04:30:50 +02:00
"rows: 10\n",
"email,name,state,gender,purchase,count,date\tColumn 2\n",
"\"danny.baron@example1.com,Danny Baron,CA,M,TV (UTF-8: 📺),1,\"\"Wed, 4 Jul 2001\"\t\n",
"melanie.white@example2.edu,Melanie White,NC,F,<iPhone>,1,2001-07-04T12:08:56\t\n",
"danny.baron@example1.com, D.\t\"(\"\"Tab\"\") Baron,CA,M,Winter jacket,1,2001-07-04\"\n",
"ben.tyler@example3.org,Ben Tyler,NV,M,Flashlight,1,2001/07/04\t\n",
"arthur.duff@example4.com,Arthur Duff,OR,M,Dining table,1,2001-07\t\n",
"danny.baron@example1.com,Daniel Baron,,,Bike,1,2001\t\n",
"jean.griffith@example5.org,Jean Griffith,WA,F,Power drill,1,2000\t\n",
"melanie.white@example2.edu,Melanie White,NC,F,'iPad',1,1999\t\n",
"ben.morisson@example6.org,Ben Morisson,FL,M,Amplifier,1,1998\t\n",
"arthur.duff@example4.com,Arthur Duff,OR,M,Night table,1,1997\t\n",
2019-08-22 01:43:52 +02:00
"Project 2493837924937 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', separator=' ', processQuotes=False)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### projectName"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 39,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 1568868311685\n",
2019-08-20 04:30:50 +02:00
"rows: 10\n",
2019-08-22 01:43:52 +02:00
" id: 1568868311685\n",
" url: http://127.0.0.1:3333/project?project=1568868311685\n",
2019-08-20 04:30:50 +02:00
" name: foo\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:06Z\n",
" created: 2019-08-21T23:31:06Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'foo', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n",
2019-08-22 01:43:52 +02:00
"Project 1568868311685 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', projectName='foo')\n",
"cli.info(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### projectTags (introduced in OpenRefine 2.8)\n",
"\n",
"check manually at http://127.0.0.1:3333 > Open Project if tags where stored"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 40,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 1889306695897\n",
2019-08-20 04:30:50 +02:00
"rows: 10\n",
2019-08-22 01:43:52 +02:00
" id: 1889306695897\n",
" url: http://127.0.0.1:3333/project?project=1889306695897\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
" tags: [u'client1', u'beta']\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:06Z\n",
" created: 2019-08-21T23:31:06Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'projectTags': [u'client1', u'beta'], u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', projectTags=['client1', 'beta'])\n",
"cli.info(p.project_id)"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 41,
2019-08-20 04:30:50 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"Project 1889306695897 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### skipDataLines\n",
"\n",
"should contain 5 rows"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 42,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 1906416549071\n",
2019-08-20 04:30:50 +02:00
"rows: 5\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
2019-08-22 01:43:52 +02:00
"Project 1906416549071 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', skipDataLines=5)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### storeBlankCellsAsNulls\n",
"\n",
"check OpenRefine GUI at url below:\n",
"* All > View > Show/Hide 'null' values in cells\n",
"* row 6 should contain null values in columns state and gender"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 43,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 1641203332364\n",
2019-08-20 04:30:50 +02:00
"rows: 10\n",
2019-08-22 01:43:52 +02:00
" id: 1641203332364\n",
" url: http://127.0.0.1:3333/project?project=1641203332364\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:06Z\n",
" created: 2019-08-21T23:31:06Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': True, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', guessCellValueTypes=True)\n",
"cli.info(p.project_id)"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 44,
2019-08-20 04:30:50 +02:00
"metadata": {
2019-08-22 01:43:52 +02:00
"scrolled": false
2019-08-20 04:30:50 +02:00
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"Project 1641203332364 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TSV"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 45,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 2332414205165\n",
2019-08-20 04:30:50 +02:00
"rows: 10\n",
2019-08-22 01:43:52 +02:00
" id: 2332414205165\n",
" url: http://127.0.0.1:3333/project?project=2332414205165\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:06Z\n",
" created: 2019-08-21T23:31:06Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.tsv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'limit': -1, u'trimStrings': False, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>\t1\t2001-07-04T12:08:56\n",
"danny.baron@example1.com\t\"D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
2019-08-22 01:43:52 +02:00
"Project 2332414205165 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.tsv')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## JSON"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 46,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 1978993820770\n",
2019-08-20 04:30:50 +02:00
"rows: 10\n",
2019-08-22 01:43:52 +02:00
" id: 1978993820770\n",
" url: http://127.0.0.1:3333/project?project=1978993820770\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:06Z\n",
" created: 2019-08-21T23:31:06Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: _ - name\n",
" column 002: _ - date\n",
" column 003: _ - email\n",
" column 004: _ - state\n",
" column 005: _ - count\n",
" column 006: _ - gender\n",
" column 007: _ - purchase\n",
"_ - name\t_ - date\t_ - email\t_ - state\t_ - count\t_ - gender\t_ - purchase\n",
"Danny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\tCA\t1\tM\tTV (UTF-8: 📺)\n",
"Melanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\tNC\t1\tF\t<iPhone>\n",
"\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\tCA\t1\tM\tWinter jacket\n",
"Ben Tyler\t2001/07/04\tben.tyler@example3.org\tNV\t1\tM\tFlashlight\n",
"Arthur Duff\t2001-07\tarthur.duff@example4.com\tOR\t1\tM\tDining table\n",
"Daniel Baron\t2001\tdanny.baron@example1.com\t\t1\t\tBike\n",
"Jean Griffith\t2000\tjean.griffith@example5.org\tWA\t1\tF\tPower drill\n",
"Melanie White\t1999\tmelanie.white@example2.edu\tNC\t1\tF\t'iPad'\n",
"Ben Morisson\t1998\tben.morisson@example6.org\tFL\t1\tM\tAmplifier\n",
"Arthur Duff\t1997\tarthur.duff@example4.com\tOR\t1\tM\tNight table\n",
2019-08-22 01:43:52 +02:00
"Project 1978993820770 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.json')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### trimStrings (broken, does not work in the GUI either)\n",
"\n",
"check row 3 if spaces before `D.` are deleted"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 47,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 1892692171021\n",
2019-08-20 04:30:50 +02:00
"rows: 10\n",
2019-08-22 01:43:52 +02:00
" id: 1892692171021\n",
" url: http://127.0.0.1:3333/project?project=1892692171021\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:07Z\n",
" created: 2019-08-21T23:31:06Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': True, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: _ - name\n",
" column 002: _ - date\n",
" column 003: _ - email\n",
" column 004: _ - state\n",
" column 005: _ - count\n",
" column 006: _ - gender\n",
" column 007: _ - purchase\n",
"_ - name\t_ - date\t_ - email\t_ - state\t_ - count\t_ - gender\t_ - purchase\n",
"Danny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\tCA\t1\tM\tTV (UTF-8: 📺)\n",
"Melanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\tNC\t1\tF\t<iPhone>\n",
"\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\tCA\t1\tM\tWinter jacket\n",
"Ben Tyler\t2001/07/04\tben.tyler@example3.org\tNV\t1\tM\tFlashlight\n",
"Arthur Duff\t2001-07\tarthur.duff@example4.com\tOR\t1\tM\tDining table\n",
"Daniel Baron\t2001\tdanny.baron@example1.com\t\t1\t\tBike\n",
"Jean Griffith\t2000\tjean.griffith@example5.org\tWA\t1\tF\tPower drill\n",
"Melanie White\t1999\tmelanie.white@example2.edu\tNC\t1\tF\t'iPad'\n",
"Ben Morisson\t1998\tben.morisson@example6.org\tFL\t1\tM\tAmplifier\n",
"Arthur Duff\t1997\tarthur.duff@example4.com\tOR\t1\tM\tNight table\n",
2019-08-22 01:43:52 +02:00
"Project 1892692171021 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.json', trimStrings=True)\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### recordPath"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 48,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 1945894618034\n",
2019-08-20 04:30:50 +02:00
"rows: 10\n",
2019-08-22 01:43:52 +02:00
" id: 1945894618034\n",
" url: http://127.0.0.1:3333/project?project=1945894618034\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:07Z\n",
" created: 2019-08-21T23:31:07Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_', u'purchase'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: purchase\n",
"purchase\n",
"TV (UTF-8: 📺)\n",
"<iPhone>\n",
"Winter jacket\n",
"Flashlight\n",
"Dining table\n",
"Bike\n",
"Power drill\n",
"'iPad'\n",
"Amplifier\n",
"Night table\n",
2019-08-22 01:43:52 +02:00
"Project 1945894618034 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.json', recordPath=['_', '_', 'purchase'])\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### storeEmptyStrings\n",
"\n",
"default: True; set to False for null values\n",
"\n",
"check OpenRefine GUI at url below:\n",
"* All > View > Show/Hide 'null' values in cells\n",
"* row 6 should contain null values in columns state and gender"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 49,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 2551263767214\n",
2019-08-20 04:30:50 +02:00
"rows: 10\n",
2019-08-22 01:43:52 +02:00
" id: 2551263767214\n",
" url: http://127.0.0.1:3333/project?project=2551263767214\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:07Z\n",
" created: 2019-08-21T23:31:07Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': False, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: _ - name\n",
" column 002: _ - date\n",
" column 003: _ - email\n",
" column 004: _ - count\n",
" column 005: _ - purchase\n",
" column 006: _ - state\n",
" column 007: _ - gender\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.json', storeEmptyStrings=False)\n",
"cli.info(p.project_id)"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 50,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"Project 2551263767214 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## XML"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 51,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 1926835461545\n",
2019-08-20 04:30:50 +02:00
"rows: 80\n",
2019-08-22 01:43:52 +02:00
" id: 1926835461545\n",
" url: http://127.0.0.1:3333/project?project=1926835461545\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:07Z\n",
" created: 2019-08-21T23:31:07Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 80\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.xml', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'root'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: root\n",
" column 002: root - record\n",
" column 003: root - record - name\n",
" column 004: root - record - date\n",
" column 005: root - record - email\n",
" column 006: root - record - count\n",
" column 007: root - record - purchase\n",
" column 008: root - record - state\n",
" column 009: root - record - gender\n",
"root\troot - record\troot - record - name\troot - record - date\troot - record - email\troot - record - count\troot - record - purchase\troot - record - state\troot - record - gender\n",
"\"\n",
" \"\t\"\n",
" \"\tDanny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\t1\tTV (UTF-8: 📺)\tCA\tM\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\tMelanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\t1\t<iPhone>\tNC\tF\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
"\"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\t1\tWinter jacket\tCA\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tBen Tyler\t2001/07/04\tben.tyler@example3.org\t1\tFlashlight\tNV\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tArthur Duff\t2001-07\tarthur.duff@example4.com\t1\tDining table\tOR\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tDaniel Baron\t2001\tdanny.baron@example1.com\t1\tBike\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tJean Griffith\t2000\tjean.griffith@example5.org\t1\tPower drill\tWA\tF\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tMelanie White\t1999\tmelanie.white@example2.edu\t1\t'iPad'\tNC\tF\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tBen Morisson\t1998\tben.morisson@example6.org\t1\tAmplifier\tFL\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tArthur Duff\t1997\tarthur.duff@example4.com\t1\tNight table\tOR\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
2019-08-22 01:43:52 +02:00
"Project 1926835461545 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.xml')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### trimStrings (broken, does not work in the GUI either)\n",
"\n",
"check if spaces before `D.` are deleted"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 52,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 1615744471501\n",
2019-08-20 04:30:50 +02:00
"rows: 80\n",
2019-08-22 01:43:52 +02:00
" id: 1615744471501\n",
" url: http://127.0.0.1:3333/project?project=1615744471501\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:07Z\n",
" created: 2019-08-21T23:31:07Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 80\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.xml', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'root'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': True, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: root\n",
" column 002: root - record\n",
" column 003: root - record - name\n",
" column 004: root - record - date\n",
" column 005: root - record - email\n",
" column 006: root - record - count\n",
" column 007: root - record - purchase\n",
" column 008: root - record - state\n",
" column 009: root - record - gender\n",
"root\troot - record\troot - record - name\troot - record - date\troot - record - email\troot - record - count\troot - record - purchase\troot - record - state\troot - record - gender\n",
"\"\n",
" \"\t\"\n",
" \"\tDanny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\t1\tTV (UTF-8: 📺)\tCA\tM\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\tMelanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\t1\t<iPhone>\tNC\tF\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
"\"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\t1\tWinter jacket\tCA\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tBen Tyler\t2001/07/04\tben.tyler@example3.org\t1\tFlashlight\tNV\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tArthur Duff\t2001-07\tarthur.duff@example4.com\t1\tDining table\tOR\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tDaniel Baron\t2001\tdanny.baron@example1.com\t1\tBike\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tJean Griffith\t2000\tjean.griffith@example5.org\t1\tPower drill\tWA\tF\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tMelanie White\t1999\tmelanie.white@example2.edu\t1\t'iPad'\tNC\tF\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tBen Morisson\t1998\tben.morisson@example6.org\t1\tAmplifier\tFL\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tArthur Duff\t1997\tarthur.duff@example4.com\t1\tNight table\tOR\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
2019-08-22 01:43:52 +02:00
"Project 1615744471501 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.xml', trimStrings=True)\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### recordPath"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 53,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 1843370951454\n",
2019-08-20 04:30:50 +02:00
"rows: 10\n",
2019-08-22 01:43:52 +02:00
" id: 1843370951454\n",
" url: http://127.0.0.1:3333/project?project=1843370951454\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:07Z\n",
" created: 2019-08-21T23:31:07Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.xml', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'root', u'record', u'purchase'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: purchase\n",
"purchase\n",
"TV (UTF-8: 📺)\n",
"<iPhone>\n",
"Winter jacket\n",
"Flashlight\n",
"Dining table\n",
"Bike\n",
"Power drill\n",
"'iPad'\n",
"Amplifier\n",
"Night table\n",
2019-08-22 01:43:52 +02:00
"Project 1843370951454 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.xml', recordPath=['root', 'record', 'purchase'])\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### storeEmptyStrings\n",
"\n",
"default: True; set to False for null values\n",
"\n",
"check OpenRefine GUI at url below:\n",
"* All > View > Show/Hide 'null' values in cells\n",
"* row 6 should contain null values in columns state and gender"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 54,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 2549624481101\n",
2019-08-20 04:30:50 +02:00
"rows: 10\n",
2019-08-22 01:43:52 +02:00
" id: 2549624481101\n",
" url: http://127.0.0.1:3333/project?project=2549624481101\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:07Z\n",
" created: 2019-08-21T23:31:07Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': False, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', storeEmptyStrings=False)\n",
"cli.info(p.project_id)"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 55,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"Project 2549624481101 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TXT"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default (line-based)"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 56,
2019-08-20 04:30:50 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 2029778313736\n",
2019-08-20 04:30:50 +02:00
"rows: 11\n",
2019-08-22 01:43:52 +02:00
" id: 2029778313736\n",
" url: http://127.0.0.1:3333/project?project=2029778313736\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:07Z\n",
" created: 2019-08-21T23:31:07Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 11\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.txt', u'storeBlankRows': True, u'encoding': u'', u'ignoreLines': -1, u'projectName': u'duplicates', u'processQuotes': True, u'skipDataLines': -1, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 0}]\n",
" column 001: Column 1\n",
"Column 1\n",
"email name state gender purchase count date \n",
"danny.baron@example1.com Danny Baron CA M TV (UTF-8: 📺) 1 Wed, 4 Jul 2001 \n",
"melanie.white@example2.edu Melanie White NC F <iPhone> 1 2001-07-04T12:08:5\n",
"\"danny.baron@example1.com D.\t(\"\"Tab\"\") Baron CA M Winter jacket 1 2001-07-04 \"\n",
"ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04 \n",
"arthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07 \n",
"danny.baron@example1.com Daniel Baron Bike 1 2001 \n",
"jean.griffith@example5.org Jean Griffith WA F Power drill 1 2000 \n",
"melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999 \n",
"ben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998 \n",
"arthur.duff@example4.com Arthur Duff OR M Night table 1 1997 \n",
2019-08-22 01:43:52 +02:00
"Project 2029778313736 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.txt')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
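{
"cell_type": "markdown",
"metadata": {},
"source": [
"### linesPerRow\n",
"\n",
"Expected result: 6 rows in 2 columns (every 2 input lines are merged into one row; with 11 input lines the second column of the last row stays empty)."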
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 57,
2019-08-20 04:30:50 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 1614710460265\n",
2019-08-20 04:30:50 +02:00
"rows: 6\n",
2019-08-22 01:43:52 +02:00
" id: 1614710460265\n",
" url: http://127.0.0.1:3333/project?project=1614710460265\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:08Z\n",
" created: 2019-08-21T23:31:08Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 6\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.txt', u'storeBlankRows': True, u'encoding': u'', u'ignoreLines': -1, u'projectName': u'duplicates', u'processQuotes': True, u'limit': -1, u'skipDataLines': -1, u'separator': u',', u'trimStrings': False, u'linesPerRow': 2, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 0}]\n",
" column 001: Column 1\n",
" column 002: Column 2\n",
"Column 1\tColumn 2\n",
"email name state gender purchase count date \tdanny.baron@example1.com Danny Baron CA M TV (UTF-8: 📺) 1 Wed, 4 Jul 2001 \n",
"melanie.white@example2.edu Melanie White NC F <iPhone> 1 2001-07-04T12:08:5\t\"danny.baron@example1.com D.\t(\"\"Tab\"\") Baron CA M Winter jacket 1 2001-07-04 \"\n",
"ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04 \tarthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07 \n",
"danny.baron@example1.com Daniel Baron Bike 1 2001 \tjean.griffith@example5.org Jean Griffith WA F Power drill 1 2000 \n",
"melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999 \tben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998 \n",
"arthur.duff@example4.com Arthur Duff OR M Night table 1 1997 \t\n",
2019-08-22 01:43:52 +02:00
"Project 1614710460265 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.txt', linesPerRow=2)\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### fixed-width: columnWidths and headerLines"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 58,
2019-08-20 04:30:50 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 1729341878534\n",
2019-08-20 04:30:50 +02:00
"rows: 10\n",
2019-08-22 01:43:52 +02:00
" id: 1729341878534\n",
" url: http://127.0.0.1:3333/project?project=1729341878534\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:08Z\n",
" created: 2019-08-21T23:31:08Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.txt', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'limit': -1, u'separator': u',', u'trimStrings': False, u'columnWidths': [27, 21, 6, 7, 15, 6, 1000], u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 1}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com \tDanny Baron \tCA \tM \tTV (UTF-8: 📺) \t1 \tWed, 4 Jul 2001 \n",
"melanie.white@example2.edu \tMelanie White \tNC \tF \t<iPhone> \t1 \t2001-07-04T12:08:5\n",
"danny.baron@example1.com \t\" D.\t(\"\"Tab\"\") Baron \"\tCA \tM \tWinter jacket \t1 \t2001-07-04 \n",
"ben.tyler@example3.org \tBen Tyler \tNV \tM \tFlashlight \t1 \t2001/07/04 \n",
"arthur.duff@example4.com \tArthur Duff \tOR \tM \tDining table \t1 \t2001-07 \n",
"danny.baron@example1.com \tDaniel Baron \t \t \tBike \t1 \t2001 \n",
"jean.griffith@example5.org \tJean Griffith \tWA \tF \tPower drill \t1 \t2000 \n",
"melanie.white@example2.edu \tMelanie White \tNC \tF \t'iPad' \t1 \t1999 \n",
"ben.morisson@example6.org \tBen Morisson \tFL \tM \tAmplifier \t1 \t1998 \n",
"arthur.duff@example4.com \tArthur Duff \tOR \tM \tNight table \t1 \t1997 \n",
2019-08-22 01:43:52 +02:00
"Project 1729341878534 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.txt', columnWidths=[27, 21, 6, 7, 15, 6, 1000], headerLines=1)\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ZIP"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default\n",
"\n",
"Expected result: 16 rows (10 from duplicates.csv plus 6 from duplicates2.csv, the two files inside the ZIP archive)."
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 59,
2019-08-20 04:30:50 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 2279718038457\n",
2019-08-20 04:30:50 +02:00
"rows: 16\n",
2019-08-22 01:43:52 +02:00
" id: 2279718038457\n",
" url: http://127.0.0.1:3333/project?project=2279718038457\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:08Z\n",
" created: 2019-08-21T23:31:08Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 16\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}, {u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>\t1\t2001-07-04T12:08:56\n",
"danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺), Winter jacket, bike\t3\tWed, 4 Jul 2001, 2001-07-04, 2001\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>, 'iPad'\t2\t2001-07-04T12:08:56, 1999\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table, Night table\t2\t2001-07, 1997\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
2019-08-22 01:43:52 +02:00
"Project 2279718038457 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.zip')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### includeFileSources\n",
"\n",
"Expected result: an additional first column `File` that names the source file of each row."
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 60,
2019-08-20 04:30:50 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2019-08-22 01:43:52 +02:00
"id: 2100283089198\n",
2019-08-20 04:30:50 +02:00
"rows: 16\n",
2019-08-22 01:43:52 +02:00
" id: 2100283089198\n",
" url: http://127.0.0.1:3333/project?project=2100283089198\n",
2019-08-20 04:30:50 +02:00
" name: duplicates\n",
2019-08-22 01:43:52 +02:00
" modified: 2019-08-21T23:31:08Z\n",
" created: 2019-08-21T23:31:08Z\n",
2019-08-20 04:30:50 +02:00
" rowCount: 16\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': True}, {u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': True}]\n",
" column 001: File\n",
" column 002: email\n",
" column 003: name\n",
" column 004: state\n",
" column 005: gender\n",
" column 006: purchase\n",
" column 007: count\n",
" column 008: date\n",
"File\temail\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"duplicates.csv\tdanny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n",
"duplicates.csv\tmelanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>\t1\t2001-07-04T12:08:56\n",
"duplicates.csv\tdanny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n",
"duplicates.csv\tben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"duplicates.csv\tarthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"duplicates.csv\tdanny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"duplicates.csv\tjean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"duplicates.csv\tmelanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"duplicates.csv\tben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"duplicates.csv\tarthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
"duplicates2.csv\tdanny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺), Winter jacket, bike\t3\tWed, 4 Jul 2001, 2001-07-04, 2001\n",
"duplicates2.csv\tmelanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>, 'iPad'\t2\t2001-07-04T12:08:56, 1999\n",
"duplicates2.csv\tben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"duplicates2.csv\tarthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table, Night table\t2\t2001-07, 1997\n",
"duplicates2.csv\tjean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"duplicates2.csv\tben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
2019-08-22 01:43:52 +02:00
"Project 2100283089198 has been successfully deleted\n"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.zip', includeFileSources=True)\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ODS (broken in OpenRefine >=2.8)"
]
},
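{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default\n",
"\n",
"Known issue: the import yields many blank columns and rows in OpenRefine <=2.7 (the same happens with a manual import via the GUI). The run recorded below failed with `Exception: Project not created`."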
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": 61,
2019-08-20 04:30:50 +02:00
"metadata": {},
"outputs": [
{
2019-08-22 01:43:52 +02:00
"ename": "Exception",
"evalue": "Project not created",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mException\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-61-d02472fdd85b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcli\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data/cli/duplicates.ods'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mcli\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mproject_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mcli\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexport\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mproject_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mcli\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdelete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mproject_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/felix/.local/lib/python2.7/site-packages/google/refine/cli.pyc\u001b[0m in \u001b[0;36mcreate\u001b[0;34m(project_file, project_format, columnWidths, encoding, guessCellValueTypes, headerLines, ignoreLines, includeFileSources, limit, linesPerRow, processQuotes, projectName, projectTags, recordPath, separator, sheets, skipDataLines, storeBlankCellsAsNulls, storeBlankRows, storeEmptyStrings, trimStrings)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0mstore_blank_cells_as_nulls\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstoreBlankCellsAsNulls\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0minclude_file_sources\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mincludeFileSources\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m **kwargs)\n\u001b[0m\u001b[1;32m 128\u001b[0m \u001b[0mrows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mproject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'get-rows'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'total'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrows\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/felix/.local/lib/python2.7/site-packages/google/refine/refine.pyc\u001b[0m in \u001b[0;36mnew_project\u001b[0;34m(self, project_file, project_url, project_name, project_format, encoding, separator, ignore_lines, header_lines, skip_data_lines, limit, store_blank_rows, guess_cell_value_types, process_quotes, store_blank_cells_as_nulls, include_file_sources, **opts)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mRefineProject\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mserver\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproject_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 281\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Project not created'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 282\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 283\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mException\u001b[0m: Project not created"
2019-08-20 04:30:50 +02:00
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.ods')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### sheets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"first sheet from file with 2 sheets"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": null,
2019-08-20 04:30:50 +02:00
"metadata": {},
2019-08-22 01:43:52 +02:00
"outputs": [],
2019-08-20 04:30:50 +02:00
"source": [
"p = cli.create('data/cli/duplicates2.ods', sheets=[0])\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"both sheets from file with 2 sheets: should contain 16 rows"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": null,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
2019-08-22 01:43:52 +02:00
"outputs": [],
2019-08-20 04:30:50 +02:00
"source": [
"p = cli.create('data/cli/duplicates2.ods', sheets=[0, 1])\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## XLS (broken in OpenRefine >=2.8)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": null,
2019-08-20 04:30:50 +02:00
"metadata": {},
2019-08-22 01:43:52 +02:00
"outputs": [],
2019-08-20 04:30:50 +02:00
"source": [
"p = cli.create('data/cli/duplicates.xls')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### sheets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"first sheet from file with 2 sheets"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": null,
2019-08-20 04:30:50 +02:00
"metadata": {},
2019-08-22 01:43:52 +02:00
"outputs": [],
2019-08-20 04:30:50 +02:00
"source": [
"p = cli.create('data/cli/duplicates2.xls', sheets=[0])\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"both sheets from file with 2 sheets: should contain 16 rows"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": null,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
2019-08-22 01:43:52 +02:00
"outputs": [],
2019-08-20 04:30:50 +02:00
"source": [
"p = cli.create('data/cli/duplicates2.xls', sheets=[0, 1])\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## XLSX (broken in OpenRefine >=2.8)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": null,
2019-08-20 04:30:50 +02:00
"metadata": {},
2019-08-22 01:43:52 +02:00
"outputs": [],
2019-08-20 04:30:50 +02:00
"source": [
"p = cli.create('data/cli/duplicates.xlsx')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### sheets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"first sheet from file with 2 sheets"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": null,
2019-08-20 04:30:50 +02:00
"metadata": {},
2019-08-22 01:43:52 +02:00
"outputs": [],
2019-08-20 04:30:50 +02:00
"source": [
"p = cli.create('data/cli/duplicates2.xlsx', sheets=[0])\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"both sheets from file with 2 sheets: should contain 16 rows"
]
},
{
"cell_type": "code",
2019-08-22 01:43:52 +02:00
"execution_count": null,
2019-08-20 04:30:50 +02:00
"metadata": {
"scrolled": false
},
2019-08-22 01:43:52 +02:00
"outputs": [],
2019-08-20 04:30:50 +02:00
"source": [
"p = cli.create('data/cli/duplicates2.xlsx', sheets=[0, 1])\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.16"
}
},
"nbformat": 4,
"nbformat_minor": 2
}