diff --git a/google/refine/facet.py b/google/refine/facet.py index 2c5f254..09e7578 100644 --- a/google/refine/facet.py +++ b/google/refine/facet.py @@ -28,6 +28,7 @@ def to_camel(attr): return (attr[0].lower() + re.sub(r'_(.)', lambda x: x.group(1).upper(), attr[1:])) + def from_camel(attr): """convert thisAttrName to this_attr_name.""" # Don't add an underscore for capitalized first letter @@ -35,8 +36,8 @@ def from_camel(attr): class Facet(object): - def __init__(self, column, type, **options): - self.type = type + def __init__(self, column, facet_type, **options): + self.type = facet_type self.name = column self.column_name = column for k, v in options.items(): @@ -50,17 +51,17 @@ class Facet(object): class TextFilterFacet(Facet): def __init__(self, column, query, **options): super(TextFilterFacet, self).__init__( - column, query=query, case_sensitive=False, type='text', + column, query=query, case_sensitive=False, facet_type='text', mode='text', **options) class TextFacet(Facet): def __init__(self, column, selection=None, expression='value', - omit_blank=False, omit_error=False, select_blank=False, - select_error=False, invert=False, **options): + omit_blank=False, omit_error=False, select_blank=False, + select_error=False, invert=False, **options): super(TextFacet, self).__init__( column, - type='list', + facet_type='list', omit_blank=omit_blank, omit_error=omit_error, select_blank=select_blank, @@ -99,37 +100,39 @@ class BoolFacet(TextFacet): raise ValueError('selection must be True or False.') if expression is None: raise ValueError('Missing expression') - super(BoolFacet, self).__init__(column, - expression=expression, selection=selection) + super(BoolFacet, self).__init__( + column, expression=expression, selection=selection) class StarredFacet(BoolFacet): def __init__(self, selection=None): - super(StarredFacet, self).__init__('', - expression='row.starred', selection=selection) + super(StarredFacet, self).__init__( + '', expression='row.starred', selection=selection) class FlaggedFacet(BoolFacet): def __init__(self, selection=None): - super(FlaggedFacet, self).__init__('', - expression='row.flagged', selection=selection) + super(FlaggedFacet, self).__init__( + '', expression='row.flagged', selection=selection) class BlankFacet(BoolFacet): def __init__(self, column, selection=None): - super(BlankFacet, self).__init__(column, - expression='isBlank(value)', selection=selection) + super(BlankFacet, self).__init__( + column, expression='isBlank(value)', selection=selection) class ReconJudgmentFacet(TextFacet): def __init__(self, column, **options): - super(ReconJudgmentFacet, self).__init__(column, + super(ReconJudgmentFacet, self).__init__( + column, expression=('forNonBlank(cell.recon.judgment, v, v, ' 'if(isNonBlank(value), "(unreconciled)", "(blank)"))'), **options) # Capitalize 'From' to get around python's reserved word. +#noinspection PyPep8Naming class NumericFacet(Facet): def __init__(self, column, From=None, to=None, expression='value', select_blank=True, select_error=True, select_non_numeric=True, @@ -139,7 +142,7 @@ class NumericFacet(Facet): From=From, to=to, expression=expression, - type='range', + facet_type='range', select_blank=select_blank, select_error=select_error, select_non_numeric=select_non_numeric, @@ -155,10 +158,12 @@ class NumericFacet(Facet): class FacetResponse(object): """Class for unpacking an individual facet response.""" def __init__(self, facet): + self.name = None for k, v in facet.items(): if isinstance(k, bool) or isinstance(k, basestring): setattr(self, from_camel(k), v) self.choices = {} + class FacetChoice(object): def __init__(self, c): self.count = c['c'] @@ -188,11 +193,14 @@ class FacetsResponse(object): def __init__(self, engine, facets): class FacetResponseContainer(object): facets = None + def __init__(self, facet_responses): self.facets = [FacetResponse(fr) for fr in facet_responses] + def __iter__(self): for facet in self.facets: yield facet + def __getitem__(self, index): if not isinstance(index, int): index = engine.facet_index_by_id[id(index)] diff --git a/google/refine/history.py b/google/refine/history.py index 048abcf..ad8ac36 100644 --- a/google/refine/history.py +++ b/google/refine/history.py @@ -18,15 +18,13 @@ Google Refine history: parsing responses. # You should have received a copy of the GNU General Public License # along with this program. If not, see -import json -import re - class HistoryEntry(object): # N.B. e.g. **response['historyEntry'] won't work as keys are unicode :-/ - def __init__(self, id=None, time=None, description=None, **kwargs): - if id is None: + #noinspection PyUnusedLocal + def __init__(self, history_entry_id=None, time=None, description=None, **kwargs): + if history_entry_id is None: raise ValueError('History entry id must be set') - self.id = id + self.id = history_entry_id self.description = description self.time = time diff --git a/google/refine/refine.py b/google/refine/refine.py index ce6abc1..aa45638 100644 --- a/google/refine/refine.py +++ b/google/refine/refine.py @@ -50,7 +50,7 @@ class RefineServer(object): def __init__(self, server=None): if server is None: - server=self.url() + server = self.url() self.server = server[:-1] if server.endswith('/') else server self.__version = None # see version @property below @@ -77,7 +77,7 @@ class RefineServer(object): url += '?' + urllib.urlencode(params) req = urllib2.Request(url) if data: - req.add_data(data) # data = urllib.urlencode(data) + req.add_data(data) # data = urllib.urlencode(data) #req.add_header('Accept-Encoding', 'gzip') try: response = urllib2.urlopen(req) @@ -114,6 +114,7 @@ class RefineServer(object): self.__version = self.get_version()['version'] return self.__version + class Refine: """Class representing a connection to a Refine server.""" def __init__(self, server): @@ -145,19 +146,19 @@ class Refine: return RefineProject(self.server, project_id) def new_project(self, project_file=None, project_url=None, - project_name=None, - split_into_columns=True, - separator='', - ignore_initial_non_blank_lines=0, - header_lines=1, # use 0 if your data has no header - skip_initial_data_rows=0, - limit=None, # no more than this number of rows - guess_value_type=True, # numbers, dates, etc. - ignore_quotes=False): + project_name=None, + split_into_columns=True, + separator='', + ignore_initial_non_blank_lines=0, + header_lines=1, # use 0 if your data has no header + skip_initial_data_rows=0, + limit=None, # no more than this number of rows + guess_value_type=True, # numbers, dates, etc. + ignore_quotes=False): - if ((project_file and project_url) or - (not project_file and not project_url)): + if (project_file and project_url) or (not project_file and not project_url): raise ValueError('One (only) of project_file and project_url must be set') + def s(opt): if isinstance(opt, bool): return 'on' if opt else '' @@ -211,6 +212,7 @@ def RowsResponseFactory(column_index): self.index = row_response['i'] self.row = [c['v'] if c else None for c in row_response['cells']] + def __getitem__(self, column): # Trailing nulls seem to be stripped from row data try: @@ -220,11 +222,14 @@ def RowsResponseFactory(column_index): def __init__(self, rows_response): self.rows_response = rows_response + def __iter__(self): for row_response in self.rows_response: yield self.RefineRow(row_response) + def __getitem__(self, index): return self.RefineRow(self.rows_response[index]) + def __len__(self): return len(self.rows_response) @@ -331,12 +336,12 @@ class RefineProject: return def apply_operations(self, file_path, wait=True): - json = open(file_path).read() - response_json = self.do_json('apply-operations', {'operations': json}) + json_data = open(file_path).read() + response_json = self.do_json('apply-operations', {'operations': json_data}) if response_json['code'] == 'pending' and wait: self.wait_until_idle() return 'ok' - return response_json['code'] # can be 'ok' or 'pending' + return response_json['code'] # can be 'ok' or 'pending' def export(self, export_format='tsv'): """Return a fileobject of a project's data.""" @@ -426,6 +431,7 @@ class RefineProject: }, }, } + def compute_clusters(self, column, clusterer_type='binning', function=None, params=None): """Returns a list of clusters of {'value': ..., 'count': ...}.""" @@ -443,7 +449,7 @@ class RefineProject: def annotate_one_row(self, row, annotation, state=True): if annotation not in ('starred', 'flagged'): raise ValueError('annotation must be one of starred or flagged') - state = 'true' if state == True else 'false' + state = 'true' if state is True else 'false' return self.do_json('annotate-one-row', {'row': row.index, annotation: state}) @@ -457,18 +463,19 @@ class RefineProject: column_insert_index=None, on_error='set-to-blank'): if column_insert_index is None: column_insert_index = self.column_order[column] + 1 - response = self.do_json('add-column', {'baseColumnName': column, - 'newColumnName': new_column, 'expression': expression, - 'columnInsertIndex': column_insert_index, 'onError': on_error}) + response = self.do_json('add-column', { + 'baseColumnName': column, 'newColumnName': new_column, + 'expression': expression, 'columnInsertIndex': column_insert_index, + 'onError': on_error}) self.get_models() return response def split_column(self, column, separator=',', mode='separator', regex=False, guess_cell_type=True, remove_original_column=True): - response = self.do_json('split-column', {'columnName': column, - 'separator': separator, 'mode': mode, 'regex': regex, - 'guessCellType': guess_cell_type, + response = self.do_json('split-column', { + 'columnName': column, 'separator': separator, 'mode': mode, + 'regex': regex, 'guessCellType': guess_cell_type, 'removeOriginalColumn': remove_original_column}) self.get_models() return response @@ -505,9 +512,11 @@ class RefineProject: self.get_models() return response - def transpose_columns_into_rows(self, start_column, column_count, - combined_column_name, separator=':', prepend_column_name=True, - ignore_blank_cells=True): + def transpose_columns_into_rows( + self, start_column, column_count, + combined_column_name, separator=':', prepend_column_name=True, + ignore_blank_cells=True): + response = self.do_json('transpose-columns-into-rows', { 'startColumnName': start_column, 'columnCount': column_count, 'combinedColumnName': combined_column_name, @@ -550,7 +559,8 @@ class RefineProject: return recon_service return None - def reconcile(self, column, service, type=None, config=None): + def reconcile(self, column, service, reconciliation_type=None, + reconciliation_config=None): """Perform a reconciliation asynchronously. config: { @@ -570,21 +580,21 @@ class RefineProject: for reconciliation to complete. """ # Create a reconciliation config by looking up recon service info - if config is None: + if reconciliation_config is None: service = self.get_reconciliation_service_by_name_or_url(service) - if type is None: + if reconciliation_type is None: raise ValueError('Must have at least one of config or type') - config = { + reconciliation_config = { 'mode': 'standard-service', 'service': service['url'], 'identifierSpace': service['identifierSpace'], 'schemaSpace': service['schemaSpace'], 'type': { - 'id': type['id'], - 'name': type['name'], + 'id': reconciliation_type['id'], + 'name': reconciliation_type['name'], }, 'autoMatch': True, 'columnDetails': [], } return self.do_json('reconcile', { - 'columnName': column, 'config': json.dumps(config)}) + 'columnName': column, 'config': json.dumps(reconciliation_config)}) diff --git a/refine.py b/refine.py index ffc832b..0e8593b 100755 --- a/refine.py +++ b/refine.py @@ -50,16 +50,19 @@ PARSER.add_option('-E', '--export', dest='export', action='store_true', PARSER.add_option('-f', '--apply', dest='apply', help='Apply a JSON commands file to a project') + def list_projects(): """Query the Refine server and list projects by ID: name.""" projects = refine.Refine(refine.RefineServer()).list_projects().items() + def date_to_epoch(json_dt): - "Convert a JSON date time into seconds-since-epoch." + """Convert a JSON date time into seconds-since-epoch.""" return time.mktime(time.strptime(json_dt, '%Y-%m-%dT%H:%M:%SZ')) projects.sort(key=lambda v: date_to_epoch(v[1]['modified']), reverse=True) for project_id, project_info in projects: print('{0:>14}: {1}'.format(project_id, project_info['name'])) + def export_project(project, options): """Dump a project to stdout or options.output file.""" export_format = 'tsv' @@ -73,8 +76,10 @@ def export_project(project, options): output.writelines(project.export(export_format=export_format)) output.close() + +#noinspection PyPep8Naming def main(): - "Main." + """Main.""" options, args = PARSER.parse_args() if options.host: @@ -100,4 +105,4 @@ def main(): if __name__ == '__main__': # return project so that it's available interactively, python -i refine.py - project = main() + refine_project = main() diff --git a/requirements.txt b/requirements.txt index 79b61df..f02ab12 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -urllib2-file>=0.2.1 \ No newline at end of file +urllib2_file>=0.2.1 \ No newline at end of file diff --git a/setup.py b/setup.py index 974d410..190b947 100644 --- a/setup.py +++ b/setup.py @@ -20,8 +20,9 @@ import os from setuptools import setup from setuptools import find_packages -def read(fname): - return open(os.path.join(os.path.dirname(__file__), fname)).read() + +def read(filename): + return open(os.path.join(os.path.dirname(__file__), filename)).read() setup(name='refine-client', version='0.2.1', @@ -35,13 +36,13 @@ setup(name='refine-client', install_requires=['urllib2_file'], platforms=['Any'], classifiers = [ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: GNU General Public License (GPL)', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Topic :: Software Development :: Libraries :: Python Modules', - 'Topic :: Text Processing', + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: GNU General Public License (GPL)', + 'Operating System :: OS Independent', + 'Programming Language :: Python', + 'Topic :: Software Development :: Libraries :: Python Modules', + 'Topic :: Text Processing', ], test_suite='tests', ) diff --git a/tests/refinetest.py b/tests/refinetest.py index fa5f4aa..6495f29 100644 --- a/tests/refinetest.py +++ b/tests/refinetest.py @@ -20,6 +20,7 @@ from google.refine import refine PATH_TO_TEST_DATA = os.path.join('tests', 'data') +#noinspection PyPep8Naming class RefineTestCase(unittest.TestCase): project_file = None project_file_options = {} @@ -42,6 +43,7 @@ class RefineTestCase(unittest.TestCase): self.project = None def assertInResponse(self, expect): + desc = None try: desc = self.project.history_entry.description self.assertTrue(expect in desc) diff --git a/tests/test_history.py b/tests/test_history.py index d565836..044a5f2 100644 --- a/tests/test_history.py +++ b/tests/test_history.py @@ -13,11 +13,11 @@ from google.refine.history import * class HistoryTest(unittest.TestCase): def test_init(self): response = { - u"code":"ok", + u"code": "ok", u"historyEntry": { - u"id":1303851435223, - u"description":"Split 4 cells", - u"time":"2011-04-26T16:45:08Z" + u"id": 1303851435223, + u"description": "Split 4 cells", + u"time": "2011-04-26T16:45:08Z" } } he = response['historyEntry'] diff --git a/tests/test_refine_small.py b/tests/test_refine_small.py index 4559b44..c525ba5 100644 --- a/tests/test_refine_small.py +++ b/tests/test_refine_small.py @@ -46,7 +46,7 @@ class RefineProjectTest(unittest.TestCase): def setUp(self): # Mock out get_models so it doesn't attempt to connect to a server self._get_models = refine.RefineProject.get_models - refine.RefineProject.get_models = lambda self: self + refine.RefineProject.get_models = lambda me: me # Save REFINE_{HOST,PORT} as tests overwrite it self._refine_host_port = refine.REFINE_HOST, refine.REFINE_PORT refine.REFINE_HOST, refine.REFINE_PORT = '127.0.0.1', '3333' @@ -65,8 +65,8 @@ class RefineProjectTest(unittest.TestCase): p = RP('1658955153749') self.assertEqual(p.server.server, 'http://127.0.0.1:3333') self.assertEqual(p.project_id, '1658955153749') - refine.REFINE_HOST='10.0.0.1' - refine.REFINE_PORT='80' + refine.REFINE_HOST = '10.0.0.1' + refine.REFINE_PORT = '80' p = RP('1658955153749') self.assertEqual(p.server.server, 'http://10.0.0.1') diff --git a/tests/test_tutorial.py b/tests/test_tutorial.py index 7018404..abc57fa 100644 --- a/tests/test_tutorial.py +++ b/tests/test_tutorial.py @@ -107,7 +107,8 @@ class TutorialTestFacets(refinetest.RefineTestCase): self.assertEqual(p.expression, 'value[0, 3]') self.assertEqual(p.choices['318'].count, 2331) # {16} - commissioned_date_facet = facet.NumericFacet('Commissioned Date', + commissioned_date_facet = facet.NumericFacet( + 'Commissioned Date', expression='value.toDate().datePart("year")') self.project.engine.add_facet(commissioned_date_facet) response = self.project.compute_facets() @@ -115,7 +116,8 @@ class TutorialTestFacets(refinetest.RefineTestCase): self.assertEqual(cd.error_count, 959) self.assertEqual(cd.numeric_count, 5999) # {17} - office_description_facet = facet.NumericFacet('Office Description', + office_description_facet = facet.NumericFacet( + 'Office Description', expression=r'value.match(/\D*(\d+)\w\w Rep.*/)[0].toNumber()') self.project.engine.add_facet(office_description_facet) response = self.project.compute_facets() @@ -212,8 +214,8 @@ class TutorialTestDuplicateDetection(refinetest.RefineTestCase): indexes = [row.index for row in response.rows] self.assertEqual(indexes, range(10)) # {10} - self.project.add_column('email', 'count', - 'facetCount(value, "value", "email")') + self.project.add_column( + 'email', 'count', 'facetCount(value, "value", "email")') self.assertInResponse('column email by filling 10 rows') response = self.project.get_rows() self.assertEqual(self.project.column_order['email'], 0) # i.e. 1st @@ -258,8 +260,8 @@ class TutorialTestTransposeColumnsIntoRows(refinetest.RefineTestCase): self.project.add_column('pair', 'year', 'value[2,6].toNumber()') self.assertInResponse('filling 26185 rows') # {5} - self.project.text_transform(column='pair', - expression='value.substring(7).toNumber()') + self.project.text_transform( + column='pair', expression='value.substring(7).toNumber()') self.assertInResponse('transform on 26185 cells') # {6} self.project.rename_column('pair', 'amount') @@ -274,15 +276,16 @@ class TutorialTestTransposeColumnsIntoRows(refinetest.RefineTestCase): row10 = response.rows[9] self.assertEqual(row10['country_name'], 'Afghanistan') self.assertEqual(row10['program_name'], - 'Department of Defense Security Assistance') + 'Department of Defense Security Assistance') self.assertEqual(row10['amount'], 113777303) -class TutorialTestTransposeFixedNumbeOfRowsIntoColumns( - refinetest.RefineTestCase): +class TutorialTestTransposeFixedNumberOfRowsIntoColumns( + refinetest.RefineTestCase): project_file = 'fixed-rows.csv' project_file_options = {'split_into_columns': False, 'header_lines': 0} + def test_transpose_fixed_number_of_rows_into_columns(self): # Section "5. Structural Editing, # Transpose Fixed Number of Rows into Columns" @@ -293,7 +296,8 @@ class TutorialTestTransposeFixedNumbeOfRowsIntoColumns( self.assertInResponse('Transpose every 4 cells in column Column') # {9} - renaming column triggers a bug in Refine # {10} - self.project.add_column('Column 1', 'Transaction', + self.project.add_column( + 'Column 1', 'Transaction', 'if(value.contains(" sent "), "send", "receive")') self.assertInResponse('Column 1 by filling 4 rows') # {11} @@ -302,17 +306,20 @@ class TutorialTestTransposeFixedNumbeOfRowsIntoColumns( self.project.engine.add_facet(transaction_facet) self.project.compute_facets() # {12}, {13}, {14} - self.project.add_column('Column 1', 'Sender', + self.project.add_column( + 'Column 1', 'Sender', 'value.partition(" sent ")[0]') # XXX resetting the facet shows data in rows with Transaction=receive # which shouldn't have been possible with the facet. - self.project.add_column('Column 1', 'Recipient', - 'value.partition(" to ")[2].partition(" on ")[0]') - self.project.add_column('Column 1', 'Amount', - 'value.partition(" sent ")[2].partition(" to ")[0]') + self.project.add_column( + 'Column 1', 'Recipient', + 'value.partition(" to ")[2].partition(" on ")[0]') + self.project.add_column( + 'Column 1', 'Amount', + 'value.partition(" sent ")[2].partition(" to ")[0]') # {15} transaction_facet.reset().include('receive') - response = self.project.get_rows() + self.project.get_rows() # XXX there seems to be some kind of bug where the model doesn't # match get_rows() output - cellIndex being returned that are # out of range. @@ -322,13 +329,11 @@ class TutorialTestTransposeFixedNumbeOfRowsIntoColumns( # {16} for column, expression in ( ('Sender', - 'cells["Column 1"].value.partition(" from ")[2]' - '.partition(" on ")[0]'), + 'cells["Column 1"].value.partition(" from ")[2].partition(" on ")[0]'), ('Recipient', 'cells["Column 1"].value.partition(" received ")[0]'), ('Amount', - 'cells["Column 1"].value.partition(" received ")[2]' - '.partition(" from ")[0]') + 'cells["Column 1"].value.partition(" received ")[2].partition(" from ")[0]') ): self.project.text_transform(column, expression) self.assertInResponse('2 cells') @@ -343,21 +348,22 @@ class TutorialTestTransposeFixedNumbeOfRowsIntoColumns( self.assertInResponse('Reorder columns') -class TutorialTestTransposeVariableNumbeOfRowsIntoColumns( - refinetest.RefineTestCase): +class TutorialTestTransposeVariableNumberOfRowsIntoColumns( + refinetest.RefineTestCase): project_file = 'variable-rows.csv' project_file_options = {'split_into_columns': False, 'header_lines': 0} def test_transpose_variable_number_of_rows_into_columns(self): # {20}, {21} - self.project.add_column('Column', 'First Line', - 'if(value.contains(" on "), value, null)') + self.project.add_column( + 'Column', 'First Line', 'if(value.contains(" on "), value, null)') self.assertInResponse('Column by filling 4 rows') response = self.project.get_rows() first_names = [row['First Line'][0:10] if row['First Line'] else None for row in response.rows] - self.assertEqual(first_names, ['Tom Dalton', None, None, None, + self.assertEqual(first_names, [ + 'Tom Dalton', None, None, None, 'Morgan Law', None, None, None, None, 'Eric Batem']) # {22} self.project.move_column('First Line', 0) @@ -369,12 +375,12 @@ class TutorialTestTransposeVariableNumbeOfRowsIntoColumns( self.assertEqual(response.mode, 'record-based') self.assertEqual(response.filtered, 4) # {24} - self.project.add_column('Column', 'Status', - 'row.record.cells["Column"].value[-1]') + self.project.add_column( + 'Column', 'Status', 'row.record.cells["Column"].value[-1]') self.assertInResponse('filling 18 rows') # {25} - self.project.text_transform('Column', - 'row.record.cells["Column"].value[1, -1].join("|")') + self.project.text_transform( + 'Column', 'row.record.cells["Column"].value[1, -1].join("|")') self.assertInResponse('18 cells') # {26} self.project.engine.mode = 'row-based'