From 191d93e33fa709ea03063a9f2be7411c789d1682 Mon Sep 17 00:00:00 2001 From: Paul Makepeace Date: Mon, 25 Apr 2011 02:49:19 -0400 Subject: [PATCH] Add reorder_rows(), and supporting Sorting class. --- google/refine.py | 56 ++++++++++++++++++++++++++++++++------ google/test/test_engine.py | 25 +++++++++++++++-- google/test/test_refine.py | 21 +++++++++++++- 3 files changed, 90 insertions(+), 12 deletions(-) diff --git a/google/refine.py b/google/refine.py index 7616291..cad3530 100644 --- a/google/refine.py +++ b/google/refine.py @@ -171,6 +171,35 @@ class Engine(object): facet.reset() +class Sorting(object): + """Class representing the current sorting order for a project. + + Used in RefineProject.get_rows()""" + def __init__(self, criteria=None): + self.criteria = [] + if criteria is None: + criteria = [] + if not isinstance(criteria, list): + criteria = [criteria] + for criterion in criteria: + if isinstance(criterion, basestring): + criterion = { + 'column': criterion, + 'valueType': 'string', + 'caseSensitive': False, + } + criterion.setdefault('reverse', False) + criterion.setdefault('errorPosition', 1) + criterion.setdefault('blankPosition', 2) + self.criteria.append(criterion) + + def as_json(self): + return json.dumps({'criteria': self.criteria}) + + def __len__(self): + return len(self.criteria) + + class RefineServer(object): """Communicate with a Refine server.""" @@ -333,6 +362,7 @@ class RowsResponse(object): class RefineProject: """A Google Refine project.""" + def __init__(self, server, project_id=None, project_name=None): if not isinstance(server, RefineServer): url = urlparse.urlparse(server) @@ -353,6 +383,7 @@ class RefineProject: self.column_index = {} self.get_models() self.engine = Engine() + self.sorting = Sorting() def do_raw(self, command, data): """Issue a command to the server & return a response object.""" @@ -364,7 +395,8 @@ class RefineProject: if data is None: data = {} data['engine'] = self.engine.as_json() - return self.server.urlopen_json(command, project_id=self.project_id, data=data) + return self.server.urlopen_json(command, project_id=self.project_id, + data=data) def get_models(self): """Fill out column metadata.""" @@ -391,10 +423,9 @@ class RefineProject: def apply_operations(self, file_path, wait=True): json = open(file_path).read() response_json = self.do('apply-operations', {'operations': json}) - if response_json['code'] == 'pending': - if wait: - self.wait_until_idle() - return 'ok' + if response_json['code'] == 'pending' and wait: + self.wait_until_idle() + return 'ok' return response_json['code'] # can be 'ok' or 'pending' def export(self, export_format='tsv'): @@ -417,13 +448,22 @@ class RefineProject: response = self.do_json('compute-facets') return FacetsResponse(response) - def get_rows(self, facets=None, start=0, limit=10): + def get_rows(self, facets=None, sort_by=None, start=0, limit=10): if facets: self.engine = Engine(facets) - response = self.do_json('get-rows', { - 'sorting': "{'criteria': []}", 'start': start, 'limit': limit}) + if sort_by is not None: + self.sorting = Sorting(sort_by) + response = self.do_json('get-rows', {'sorting': self.sorting.as_json(), + 'start': start, 'limit': limit}) return RowsResponse(response) + def reorder_rows(self, sort_by=None): + if sort_by is not None: + self.sorting = Sorting(sort_by) + response = self.do_json('reorder-rows', + {'sorting': self.sorting.as_json()}) + return response + def remove_rows(self, facets=None): if facets: self.engine = Engine(facets) diff --git a/google/test/test_engine.py b/google/test/test_engine.py index e328e92..c179989 100644 --- a/google/test/test_engine.py +++ b/google/test/test_engine.py @@ -7,14 +7,15 @@ Created by Paul Makepeace on 2011-04-22. Copyright (c) 2011 Real Programmers. All rights reserved. """ -import json +import json import os import sys import unittest import urllib from google.refine import TextFacet, NumericFacet, StarredFacet, FlaggedFacet -from google.refine import Engine, FacetsResponse +from google.refine import Engine, Sorting, FacetsResponse + class FacetTest(unittest.TestCase): def test_init(self): @@ -42,6 +43,25 @@ class FacetTest(unittest.TestCase): facet = NumericFacet(column='column', From=1, to=5) self.assertEqual(facet.as_dict(), {'from': 1, 'to': 5, 'selectBlank': True, 'name': 'column', 'selectError': True, 'expression': 'value', 'selectNumeric': True, 'columnName': 'column', 'selectNonNumeric': True, 'type': 'range'}) + def test_sorting(self): + sorting = Sorting() + self.assertEqual(sorting.as_json(), '{"criteria": []}') + sorting = Sorting('email') + c = sorting.criteria[0] + self.assertEqual(c['column'], 'email') + self.assertEqual(c['valueType'], 'string') + self.assertEqual(c['reverse'], False) + self.assertEqual(c['caseSensitive'], False) + self.assertEqual(c['errorPosition'], 1) + self.assertEqual(c['blankPosition'], 2) + sorting = Sorting(['email', 'gender']) + self.assertEqual(len(sorting), 2) + sorting = Sorting(['email', {'column': 'date', 'valueType': 'date'}]) + self.assertEqual(len(sorting), 2) + c = sorting.criteria[1] + self.assertEqual(c['column'], 'date') + self.assertEqual(c['valueType'], 'date') + def test_add_facet(self): facet = TextFacet(column='Party Code') engine = Engine(facet) @@ -74,7 +94,6 @@ class FacetTest(unittest.TestCase): engine.remove_all() self.assertEqual(len(engine), 0) - def test_facets_response(self): response = """{"facets":[{"name":"Party Code","expression":"value","columnName":"Party Code","invert":false,"choices":[{"v":{"v":"D","l":"D"},"c":3700,"s":false},{"v":{"v":"R","l":"R"},"c":1613,"s":false},{"v":{"v":"N","l":"N"},"c":15,"s":false},{"v":{"v":"O","l":"O"},"c":184,"s":false}],"blankChoice":{"s":false,"c":1446}}],"mode":"row-based"}""" response = FacetsResponse(json.loads(response)) diff --git a/google/test/test_refine.py b/google/test/test_refine.py index 1e28490..784525f 100644 --- a/google/test/test_refine.py +++ b/google/test/test_refine.py @@ -235,7 +235,7 @@ class TutorialTestEditing(RefineTestCase): response = self.project.compute_facets() self.assertEqual(len(response.facets[0].choices), 65) - # Section "4. Row and Column Editing" + # Section "4. Row and Column Editing, Batched Row Deletion" # Test doesn't strictly follow the tutorial as the "Browse this # cluster" performs a text facet which the server can't complete # as it busts its max facet count. The useful work is done with @@ -261,5 +261,24 @@ class TutorialTestEditing(RefineTestCase): self.assertTrue('3 rows' in response['historyEntry']['description']) +class TutorialTestDuplicateDetection(RefineTestCase): + project_file = 'duplicates.csv' + + def test_duplicate_detection(self): + # Section "4. Row and Column Editing, + # Duplicate Row Detection and Deletion" + # {7}, {8} + response = self.project.get_rows(sort_by='email') + indexes = [r.index for r in response.rows] + self.assertEqual(indexes, [4, 9, 8, 3, 0, 2, 5, 6, 1, 7]) + # {9} + response = self.project.reorder_rows() + self.assertEqual('Reorder rows', + response['historyEntry']['description']) + response = self.project.get_rows(sort_by='email') + indexes = [r.index for r in response.rows] + self.assertEqual(indexes, range(10)) + + if __name__ == '__main__': unittest.main() \ No newline at end of file