realigned create/new_project to upstream
new feature: xml root element will be discovered if recordPath is not set; bugfix: newly introduced option projectTags was not working in 0.3.7; bugfix: txt defaulted to fixed-width (should be line-based); bugfix: default recordPath for json was not working in 0.3.7; bugfix: default sheets option was broken (but xls, xlsx and ods are broken in OpenRefine >=2.8 anyway, see #4); tests: added sample files and an ipython notebook for comprehensive tests of the create option
This commit is contained in:
parent
7ad79af3ca
commit
375ac42be0
|
@ -145,7 +145,7 @@ group5.add_option('--projectTags', dest='projectTags',
|
|||
help='(all formats), please provide tags in multiple arguments, e.g. --projectTags=beta --projectTags=client1')
|
||||
group5.add_option('--recordPath', dest='recordPath',
|
||||
action='append',
|
||||
help='(xml,json), please provide path in multiple arguments without slashes, e.g. /collection/record/ should be entered like this: --recordPath=collection --recordPath=record, default xml: record, default json: _ _')
|
||||
help='(xml,json), please provide path in multiple arguments, e.g. /collection/record/ should be entered: --recordPath=collection --recordPath=record, default xml: root element, default json: _ _')
|
||||
group5.add_option('--separator', dest='separator',
|
||||
help='(csv,tsv), default csv: , default tsv: \\t')
|
||||
group5.add_option('--sheets', dest='sheets',
|
||||
|
|
|
@ -25,6 +25,7 @@ import ssl
|
|||
import sys
|
||||
import time
|
||||
import urllib
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from google.refine import refine
|
||||
|
||||
|
@ -43,7 +44,6 @@ def apply(project_id, history_file):
|
|||
|
||||
def create(project_file,
|
||||
project_format=None,
|
||||
project_name=None,
|
||||
columnWidths=None,
|
||||
encoding=None,
|
||||
guessCellValueTypes=False,
|
||||
|
@ -54,6 +54,7 @@ def create(project_file,
|
|||
linesPerRow=None,
|
||||
processQuotes=True,
|
||||
projectName=None,
|
||||
projectTags=None,
|
||||
recordPath=None,
|
||||
separator=None,
|
||||
sheets=None,
|
||||
|
@ -69,15 +70,15 @@ def create(project_file,
|
|||
project_format = os.path.splitext(project_file)[1][1:].lower()
|
||||
if project_format == 'txt':
|
||||
try:
|
||||
columnWidths
|
||||
columnWidths[0]
|
||||
project_format = 'fixed-width'
|
||||
except NameError:
|
||||
except TypeError:
|
||||
project_format = 'line-based'
|
||||
# defaults for each file type
|
||||
if project_format == 'xml':
|
||||
project_format = 'text/xml'
|
||||
if not recordPath:
|
||||
recordPath = 'record'
|
||||
recordPath = [ElementTree.parse(project_file).getroot().tag]
|
||||
elif project_format == 'csv':
|
||||
project_format = 'text/line-based/*sv'
|
||||
elif project_format == 'tsv':
|
||||
|
@ -95,22 +96,35 @@ def create(project_file,
|
|||
elif project_format == 'json':
|
||||
project_format = 'text/json'
|
||||
if not recordPath:
|
||||
recordPath = ('_', '_')
|
||||
recordPath = ['_', '_']
|
||||
elif project_format == 'xls':
|
||||
project_format = 'binary/text/xml/xls/xlsx'
|
||||
if not sheets:
|
||||
sheets = 0
|
||||
sheets = [0]
|
||||
# TODO: new format for sheets option introduced in OpenRefine 2.8
|
||||
elif project_format == 'xlsx':
|
||||
project_format = 'binary/text/xml/xls/xlsx'
|
||||
if not sheets:
|
||||
sheets = 0
|
||||
sheets = [0]
|
||||
# TODO: new format for sheets option introduced in OpenRefine 2.8
|
||||
elif project_format == 'ods':
|
||||
project_format = 'text/xml/ods'
|
||||
if not sheets:
|
||||
sheets = 0
|
||||
sheets = [0]
|
||||
# TODO: new format for sheets option introduced in OpenRefine 2.8
|
||||
# execute
|
||||
kwargs = {k: v for k, v in vars().items() if v is not None}
|
||||
project = refine.Refine(refine.RefineServer()).new_project(**kwargs)
|
||||
project = refine.Refine(refine.RefineServer()).new_project(
|
||||
guess_cell_value_types=guessCellValueTypes,
|
||||
ignore_lines=ignoreLines,
|
||||
header_lines=headerLines,
|
||||
skip_data_lines=skipDataLines,
|
||||
store_blank_rows=storeBlankRows,
|
||||
process_quotes=processQuotes,
|
||||
project_name=projectName,
|
||||
store_blank_cells_as_nulls=storeBlankCellsAsNulls,
|
||||
include_file_sources=includeFileSources,
|
||||
**kwargs)
|
||||
rows = project.do_json('get-rows')['total']
|
||||
if rows > 0:
|
||||
print('{0}: {1}'.format('id', project.project_id))
|
||||
|
|
|
@ -147,41 +147,127 @@ class Refine:
|
|||
"""Open a Refine project."""
|
||||
return RefineProject(self.server, project_id)
|
||||
|
||||
def new_project(self, project_file=None, project_name=None,
|
||||
project_format='text/line-based/*sv', **kwargs):
|
||||
"""Create a Refine project."""
|
||||
# These aren't used yet but are included for reference
|
||||
new_project_defaults = {
|
||||
'text/line-based/*sv': {
|
||||
'encoding': '',
|
||||
'separator': ',',
|
||||
'ignore_lines': -1,
|
||||
'header_lines': 1,
|
||||
'skip_data_lines': 0,
|
||||
'limit': -1,
|
||||
'store_blank_rows': True,
|
||||
'guess_cell_value_types': True,
|
||||
'process_quotes': True,
|
||||
'store_blank_cells_as_nulls': True,
|
||||
'include_file_sources': False},
|
||||
'text/line-based': {
|
||||
'encoding': '',
|
||||
'lines_per_row': 1,
|
||||
'ignore_lines': -1,
|
||||
'limit': -1,
|
||||
'skip_data_lines': -1,
|
||||
'store_blank_rows': True,
|
||||
'store_blank_cells_as_nulls': True,
|
||||
'include_file_sources': False},
|
||||
'text/line-based/fixed-width': {
|
||||
'encoding': '',
|
||||
'column_widths': [20],
|
||||
'ignore_lines': -1,
|
||||
'header_lines': 0,
|
||||
'skip_data_lines': 0,
|
||||
'limit': -1,
|
||||
'guess_cell_value_types': False,
|
||||
'store_blank_rows': True,
|
||||
'store_blank_cells_as_nulls': True,
|
||||
'include_file_sources': False},
|
||||
'text/line-based/pc-axis': {
|
||||
'encoding': '',
|
||||
'limit': -1,
|
||||
'skip_data_lines': -1,
|
||||
'include_file_sources': False},
|
||||
'text/rdf+n3': {'encoding': ''},
|
||||
'text/xml/ods': {
|
||||
'sheets': [],
|
||||
'ignore_lines': -1,
|
||||
'header_lines': 1,
|
||||
'skip_data_lines': 0,
|
||||
'limit': -1,
|
||||
'store_blank_rows': True,
|
||||
'store_blank_cells_as_nulls': True,
|
||||
'include_file_sources': False},
|
||||
'binary/xls': {
|
||||
'xml_based': False,
|
||||
'sheets': [],
|
||||
'ignore_lines': -1,
|
||||
'header_lines': 1,
|
||||
'skip_data_lines': 0,
|
||||
'limit': -1,
|
||||
'store_blank_rows': True,
|
||||
'store_blank_cells_as_nulls': True,
|
||||
'include_file_sources': False}
|
||||
}
|
||||
|
||||
defaults = {'guessCellValueTypes': False,
|
||||
'headerLines': 1,
|
||||
'ignoreLines': -1,
|
||||
'includeFileSources': False,
|
||||
'limit': -1,
|
||||
'linesPerRow': 1,
|
||||
'processQuotes': True,
|
||||
'separator': ',',
|
||||
'skipDataLines': 0,
|
||||
'storeBlankCellsAsNulls': True,
|
||||
'storeBlankRows': True,
|
||||
'storeEmptyStrings': True,
|
||||
'trimStrings': False}
|
||||
def new_project(self, project_file=None, project_url=None, project_name=None, project_format='text/line-based/*sv',
|
||||
encoding='',
|
||||
separator=',',
|
||||
ignore_lines=-1,
|
||||
header_lines=1,
|
||||
skip_data_lines=0,
|
||||
limit=-1,
|
||||
store_blank_rows=True,
|
||||
guess_cell_value_types=False,
|
||||
process_quotes=True,
|
||||
store_blank_cells_as_nulls=True,
|
||||
include_file_sources=False,
|
||||
**opts):
|
||||
|
||||
# options
|
||||
options = {'format': project_format}
|
||||
if project_file is not None:
|
||||
options['project-file'] = {'fd': open(project_file),
|
||||
'filename': project_file}
|
||||
if (project_file and project_url) or (not project_file and not project_url):
|
||||
raise ValueError('One (only) of project_file and project_url must be set')
|
||||
|
||||
def s(opt):
|
||||
if isinstance(opt, bool):
|
||||
return 'true' if opt else 'false'
|
||||
if opt is None:
|
||||
return ''
|
||||
return str(opt)
|
||||
|
||||
# the new APIs requires a json in the 'option' POST or GET argument
|
||||
# POST is broken at the moment, so we send it in the URL
|
||||
new_style_options = dict(opts, **{
|
||||
'encoding': s(encoding),
|
||||
})
|
||||
params = {
|
||||
'options': json.dumps(new_style_options),
|
||||
}
|
||||
|
||||
# old style options
|
||||
options = {
|
||||
'format': project_format,
|
||||
'separator': s(separator),
|
||||
'ignore-lines': s(ignore_lines),
|
||||
'header-lines': s(header_lines),
|
||||
'skip-data-lines': s(skip_data_lines),
|
||||
'limit': s(limit),
|
||||
'guess-value-type': s(guess_cell_value_types),
|
||||
'process-quotes': s(process_quotes),
|
||||
'store-blank-rows': s(store_blank_rows),
|
||||
'store-blank-cells-as-nulls': s(store_blank_cells_as_nulls),
|
||||
'include-file-sources': s(include_file_sources),
|
||||
}
|
||||
|
||||
if project_url is not None:
|
||||
options['url'] = project_url
|
||||
elif project_file is not None:
|
||||
options['project-file'] = {
|
||||
'fd': open(project_file),
|
||||
'filename': project_file,
|
||||
}
|
||||
if project_name is None:
|
||||
# make a name for itself by stripping extension and directories
|
||||
project_name = (project_file or 'New project').rsplit('.', 1)[0]
|
||||
project_name = os.path.basename(project_name)
|
||||
options['project-name'] = project_name
|
||||
|
||||
# params
|
||||
params_dict = dict(defaults)
|
||||
params_dict.update(kwargs)
|
||||
params = {'options': json.dumps(params_dict)}
|
||||
|
||||
# submit
|
||||
response = self.server.urlopen(
|
||||
'create-project-from-upload', options, params
|
||||
)
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,11 @@
|
|||
email,name,state,gender,purchase,count,date
|
||||
danny.baron@example1.com,Danny Baron,CA,M,TV (UTF-8: 📺),1,"Wed, 4 Jul 2001"
|
||||
melanie.white@example2.edu,Melanie White,NC,F,<iPhone>,1,2001-07-04T12:08:56
|
||||
danny.baron@example1.com, D. ("Tab") Baron,CA,M,Winter jacket,1,2001-07-04
|
||||
ben.tyler@example3.org,Ben Tyler,NV,M,Flashlight,1,2001/07/04
|
||||
arthur.duff@example4.com,Arthur Duff,OR,M,Dining table,1,2001-07
|
||||
danny.baron@example1.com,Daniel Baron,,,Bike,1,2001
|
||||
jean.griffith@example5.org,Jean Griffith,WA,F,Power drill,1,2000
|
||||
melanie.white@example2.edu,Melanie White,NC,F,'iPad',1,1999
|
||||
ben.morisson@example6.org,Ben Morisson,FL,M,Amplifier,1,1998
|
||||
arthur.duff@example4.com,Arthur Duff,OR,M,Night table,1,1997
|
Can't render this file because it contains an unexpected character in line 4 and column 33.
|
|
@ -0,0 +1,92 @@
|
|||
[
|
||||
{
|
||||
"email": "danny.baron@example1.com",
|
||||
"name": "Danny Baron",
|
||||
"state": "CA",
|
||||
"gender": "M",
|
||||
"purchase": "TV (UTF-8: 📺)",
|
||||
"count": 1,
|
||||
"date": "Wed, 4 Jul 2001"
|
||||
},
|
||||
{
|
||||
"email": "melanie.white@example2.edu",
|
||||
"name": "Melanie White",
|
||||
"state": "NC",
|
||||
"gender": "F",
|
||||
"purchase": "<iPhone>",
|
||||
"count": 1,
|
||||
"date": "2001-07-04T12:08:56"
|
||||
},
|
||||
{
|
||||
"email": "danny.baron@example1.com",
|
||||
"name": " D.\t(\"Tab\") Baron",
|
||||
"state": "CA",
|
||||
"gender": "M",
|
||||
"purchase": "Winter jacket",
|
||||
"count": 1,
|
||||
"date": "2001-07-04"
|
||||
},
|
||||
{
|
||||
"email": "ben.tyler@example3.org",
|
||||
"name": "Ben Tyler",
|
||||
"state": "NV",
|
||||
"gender": "M",
|
||||
"purchase": "Flashlight",
|
||||
"count": 1,
|
||||
"date": "2001/07/04"
|
||||
},
|
||||
{
|
||||
"email": "arthur.duff@example4.com",
|
||||
"name": "Arthur Duff",
|
||||
"state": "OR",
|
||||
"gender": "M",
|
||||
"purchase": "Dining table",
|
||||
"count": 1,
|
||||
"date": "2001-07"
|
||||
},
|
||||
{
|
||||
"email": "danny.baron@example1.com",
|
||||
"name": "Daniel Baron",
|
||||
"state": "",
|
||||
"gender": "",
|
||||
"purchase": "Bike",
|
||||
"count": 1,
|
||||
"date": 2001
|
||||
},
|
||||
{
|
||||
"email": "jean.griffith@example5.org",
|
||||
"name": "Jean Griffith",
|
||||
"state": "WA",
|
||||
"gender": "F",
|
||||
"purchase": "Power drill",
|
||||
"count": 1,
|
||||
"date": 2000
|
||||
},
|
||||
{
|
||||
"email": "melanie.white@example2.edu",
|
||||
"name": "Melanie White",
|
||||
"state": "NC",
|
||||
"gender": "F",
|
||||
"purchase": "'iPad'",
|
||||
"count": 1,
|
||||
"date": 1999
|
||||
},
|
||||
{
|
||||
"email": "ben.morisson@example6.org",
|
||||
"name": "Ben Morisson",
|
||||
"state": "FL",
|
||||
"gender": "M",
|
||||
"purchase": "Amplifier",
|
||||
"count": 1,
|
||||
"date": 1998
|
||||
},
|
||||
{
|
||||
"email": "arthur.duff@example4.com",
|
||||
"name": "Arthur Duff",
|
||||
"state": "OR",
|
||||
"gender": "M",
|
||||
"purchase": "Night table",
|
||||
"count": 1,
|
||||
"date": 1997
|
||||
}
|
||||
]
|
Binary file not shown.
|
@ -0,0 +1,11 @@
|
|||
email name state gender purchase count date
|
||||
danny.baron@example1.com Danny Baron CA M TV (UTF-8: 📺) 1 Wed, 4 Jul 2001
|
||||
melanie.white@example2.edu Melanie White NC F <iPhone> 1 2001-07-04T12:08:56
|
||||
danny.baron@example1.com "D. (""Tab"") Baron" CA M Winter jacket 1 2001-07-04
|
||||
ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04
|
||||
arthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07
|
||||
danny.baron@example1.com Daniel Baron Bike 1 2001
|
||||
jean.griffith@example5.org Jean Griffith WA F Power drill 1 2000
|
||||
melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999
|
||||
ben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998
|
||||
arthur.duff@example4.com Arthur Duff OR M Night table 1 1997
|
|
|
@ -0,0 +1,11 @@
|
|||
email name state gender purchase count date
|
||||
danny.baron@example1.com Danny Baron CA M TV (UTF-8: 📺) 1 Wed, 4 Jul 2001
|
||||
melanie.white@example2.edu Melanie White NC F <iPhone> 1 2001-07-04T12:08:5
|
||||
danny.baron@example1.com D. ("Tab") Baron CA M Winter jacket 1 2001-07-04
|
||||
ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04
|
||||
arthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07
|
||||
danny.baron@example1.com Daniel Baron Bike 1 2001
|
||||
jean.griffith@example5.org Jean Griffith WA F Power drill 1 2000
|
||||
melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999
|
||||
ben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998
|
||||
arthur.duff@example4.com Arthur Duff OR M Night table 1 1997
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,93 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<root>
|
||||
<record>
|
||||
<email>danny.baron@example1.com</email>
|
||||
<name>Danny Baron</name>
|
||||
<state>CA</state>
|
||||
<gender>M</gender>
|
||||
<purchase>TV (UTF-8: 📺)</purchase>
|
||||
<count>1</count>
|
||||
<date>Wed, 4 Jul 2001</date>
|
||||
</record>
|
||||
<record>
|
||||
<email>melanie.white@example2.edu</email>
|
||||
<name>Melanie White</name>
|
||||
<state>NC</state>
|
||||
<gender>F</gender>
|
||||
<purchase>&lt;iPhone&gt;</purchase>
|
||||
<count>1</count>
|
||||
<date>2001-07-04T12:08:56</date>
|
||||
</record>
|
||||
<record>
|
||||
<email>danny.baron@example1.com</email>
|
||||
<name> D. ("Tab") Baron</name>
|
||||
<state>CA</state>
|
||||
<gender>M</gender>
|
||||
<purchase>Winter jacket</purchase>
|
||||
<count>1</count>
|
||||
<date>2001-07-04</date>
|
||||
</record>
|
||||
<record>
|
||||
<email>ben.tyler@example3.org</email>
|
||||
<name>Ben Tyler</name>
|
||||
<state>NV</state>
|
||||
<gender>M</gender>
|
||||
<purchase>Flashlight</purchase>
|
||||
<count>1</count>
|
||||
<date>2001/07/04</date>
|
||||
</record>
|
||||
<record>
|
||||
<email>arthur.duff@example4.com</email>
|
||||
<name>Arthur Duff</name>
|
||||
<state>OR</state>
|
||||
<gender>M</gender>
|
||||
<purchase>Dining table</purchase>
|
||||
<count>1</count>
|
||||
<date>2001-07</date>
|
||||
</record>
|
||||
<record>
|
||||
<email>danny.baron@example1.com</email>
|
||||
<name>Daniel Baron</name>
|
||||
<state></state>
|
||||
<gender></gender>
|
||||
<purchase>Bike</purchase>
|
||||
<count>1</count>
|
||||
<date>2001</date>
|
||||
</record>
|
||||
<record>
|
||||
<email>jean.griffith@example5.org</email>
|
||||
<name>Jean Griffith</name>
|
||||
<state>WA</state>
|
||||
<gender>F</gender>
|
||||
<purchase>Power drill</purchase>
|
||||
<count>1</count>
|
||||
<date>2000</date>
|
||||
</record>
|
||||
<record>
|
||||
<email>melanie.white@example2.edu</email>
|
||||
<name>Melanie White</name>
|
||||
<state>NC</state>
|
||||
<gender>F</gender>
|
||||
<purchase>'iPad'</purchase>
|
||||
<count>1</count>
|
||||
<date>1999</date>
|
||||
</record>
|
||||
<record>
|
||||
<email>ben.morisson@example6.org</email>
|
||||
<name>Ben Morisson</name>
|
||||
<state>FL</state>
|
||||
<gender>M</gender>
|
||||
<purchase>Amplifier</purchase>
|
||||
<count>1</count>
|
||||
<date>1998</date>
|
||||
</record>
|
||||
<record>
|
||||
<email>arthur.duff@example4.com</email>
|
||||
<name>Arthur Duff</name>
|
||||
<state>OR</state>
|
||||
<gender>M</gender>
|
||||
<purchase>Night table</purchase>
|
||||
<count>1</count>
|
||||
<date>1997</date>
|
||||
</record>
|
||||
</root>
|
Binary file not shown.
|
@ -0,0 +1,10 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<record>
|
||||
<email>danny.baron@example1.com</email>
|
||||
<name>Danny Baron</name>
|
||||
<state>CA</state>
|
||||
<gender>M</gender>
|
||||
<purchase>TV (UTF-8: 📺)</purchase>
|
||||
<count>1</count>
|
||||
<date>Wed, 4 Jul 2001</date>
|
||||
</record>
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue